From bf1478e538bc52051b3e3394de24f9be75820ce8 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 23 May 2025 09:52:14 -0400 Subject: [PATCH 1/6] WIP optimizer refactor w/ pointers --- src/nf/nf_dense_layer.f90 | 7 +++ src/nf/nf_dense_layer_submodule.f90 | 9 ++++ src/nf/nf_network_submodule.f90 | 17 +++++-- src/nf/nf_optimizers.f90 | 76 ++++++++++++++++++++--------- 4 files changed, 82 insertions(+), 27 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 862f4cdf..462434f6 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -36,6 +36,7 @@ module nf_dense_layer procedure :: get_gradients procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -96,6 +97,12 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: w_ptr(:,:) + real, pointer :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a424cf9c..d0ac015a 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -77,6 +77,15 @@ module function get_params(self) result(params) end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: w_ptr(:,:) + real, pointer :: b_ptr(:) + w_ptr => self % weights + b_ptr => self % biases + end subroutine get_params_ptr + + module function get_gradients(self) result(gradients) class(dense_layer), intent(in), target :: self real, allocatable :: gradients(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index d8f5ff50..e7c39716 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,6 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) + real, pointer :: weights(:), biases(:), gradient(:) integer :: n ! Passing the optimizer instance is optional. If not provided, and if the @@ -693,9 +694,19 @@ module subroutine update(self, optimizer, batch_size) end do #endif - params = self % get_params() - call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - call self % set_params(params) + !params = self % get_params() + !call self % optimizer % minimize(params, self % get_gradients() / batch_size_) + !call self % set_params(params) + + do n = 2, size(self % layers) + select type(this_layer => self % layers(n) % p) + type is(dense_layer) + call this_layer % get_params_ptr(weights, biases) + call self % optimizer % minimize(weights, biases, self % get_gradients() / batch_size_) + !call this_layer % set_params(weights, biases) + end select + end do + ! Flush network gradients to zero. 
do n = 2, size(self % layers) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index c64cefed..1caf8c1e 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -30,11 +30,12 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize(self, param, gradient) + pure subroutine minimize(self, weights, biases, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) end subroutine minimize end interface @@ -116,12 +117,13 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd(self, param, gradient) + pure subroutine minimize_sgd(self, weights, biases, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. class(sgd), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) if (self % momentum > 0) then ! Apply momentum update @@ -129,14 +131,18 @@ pure subroutine minimize_sgd(self, param, gradient) - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - param = param + self % momentum * self % velocity & + weights = weights + self % momentum * self % velocity & + - self % learning_rate * gradient + biases = biases + self % momentum * self % velocity & - self % learning_rate * gradient else - param = param + self % velocity + weights = weights + self % velocity + biases = biases + self % velocity end if else ! Apply regular update - param = param - self % learning_rate * gradient + weights = weights - self % learning_rate * gradient + biases = biases - self % learning_rate * gradient end if end subroutine minimize_sgd @@ -152,18 +158,21 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop(self, param, gradient) + pure subroutine minimize_rmsprop(self, weights, biases, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) ! Compute the RMS of the gradient using the RMSProp rule self % rms_gradient = self % decay_rate * self % rms_gradient & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & + weights = weights - self % learning_rate & + / sqrt(self % rms_gradient + self % epsilon) * gradient + biases = biases - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient end subroutine minimize_rmsprop @@ -180,17 +189,18 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam(self, param, gradient) + pure subroutine minimize_adam(self, weights, biases, gradient) !! Concrete implementation of an Adam optimizer update rule. 
class(adam), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. - associate(g => gradient + self % weight_decay_l2 * param) + associate(g => gradient + self % weight_decay_l2 * weights) self % m = self % beta1 * self % m + (1 - self % beta1) * g self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 end associate @@ -202,9 +212,15 @@ pure subroutine minimize_adam(self, param, gradient) ) ! Update parameters. - param = param & + weights = weights & - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) & - + self % weight_decay_decoupled * param) + + self % weight_decay_decoupled * weights) + + ! Update biases (without weight decay for biases) + associate(g => gradient) + biases = biases & + - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon)) + end associate end associate @@ -221,19 +237,21 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad(self, param, gradient) + pure subroutine minimize_adagrad(self, weights, biases, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) ! Update the current time step self % t = self % t + 1 + ! For weights associate( & ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adagrad. - g => gradient + self % weight_decay_l2 * param, & + g => gradient + self % weight_decay_l2 * weights, & ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & @@ -241,10 +259,20 @@ pure subroutine minimize_adagrad(self, param, gradient) self % sum_squared_gradient = self % sum_squared_gradient + g**2 - param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) & + self % epsilon) end associate + + ! 
For biases (without weight decay) + associate( & + g => gradient, & + learning_rate => self % learning_rate & + / (1 + (self % t - 1) * self % learning_rate_decay) & + ) + biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) & + + self % epsilon) + end associate end subroutine minimize_adagrad From 38896cc57abc017987f8b46b9650cb0ec3151545 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 27 May 2025 11:53:57 -0400 Subject: [PATCH 2/6] WIP optimizer optimization --- src/nf/nf_dense_layer.f90 | 7 + src/nf/nf_dense_layer_submodule.f90 | 9 ++ src/nf/nf_network_submodule.f90 | 6 +- src/nf/nf_optimizers.f90 | 201 ++++++++++++++++++++-------- 4 files changed, 164 insertions(+), 59 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 462434f6..ba6c33c4 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -34,6 +34,7 @@ module nf_dense_layer procedure :: backward procedure :: forward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params procedure :: get_params_ptr @@ -112,6 +113,12 @@ module function get_gradients(self) result(gradients) !! Gradients of this layer end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: dw_ptr(:,:) + real, pointer :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of this layer. !! The parameters are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index d0ac015a..a1ca6ce5 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -102,6 +102,15 @@ module function get_gradients(self) result(gradients) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: dw_ptr(:,:) + real, pointer :: db_ptr(:) + dw_ptr => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(dense_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index e7c39716..1d36c5e8 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,7 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) - real, pointer :: weights(:), biases(:), gradient(:) + real, pointer :: weights(:,:), biases(:), dw(:,:), db(:) integer :: n ! Passing the optimizer instance is optional. 
If not provided, and if the @@ -702,7 +702,9 @@ module subroutine update(self, optimizer, batch_size) select type(this_layer => self % layers(n) % p) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) - call self % optimizer % minimize(weights, biases, self % get_gradients() / batch_size_) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) !call this_layer % set_params(weights, biases) end select end do diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 1caf8c1e..400fbfa2 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -19,7 +19,9 @@ module nf_optimizers real :: learning_rate = 0.01 contains procedure(init), deferred :: init - procedure(minimize), deferred :: minimize + procedure(minimize_1d), deferred :: minimize_1d + procedure(minimize_2d), deferred :: minimize_2d + generic :: minimize => minimize_1d, minimize_2d end type optimizer_base_type abstract interface @@ -30,13 +32,19 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize(self, weights, biases, gradient) + pure subroutine minimize_1d(self, param, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) - end subroutine minimize + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) + end subroutine minimize_1d + + pure subroutine minimize_2d(self, param, gradient) + import :: optimizer_base_type + class(optimizer_base_type), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + end subroutine minimize_2d end interface @@ -47,7 +55,8 @@ end subroutine minimize real, allocatable, private :: velocity(:) contains procedure :: init => init_sgd - procedure :: minimize => minimize_sgd + procedure :: minimize_1d => minimize_sgd_1d + procedure :: minimize_2d => minimize_sgd_2d end type sgd type, extends(optimizer_base_type) :: rmsprop @@ -62,7 +71,8 @@ end subroutine minimize real, allocatable, private :: rms_gradient(:) contains procedure :: init => init_rmsprop - procedure :: minimize => minimize_rmsprop + procedure :: minimize_1d => minimize_rmsprop_1d + procedure :: minimize_2d => minimize_rmsprop_2d end type rmsprop type, extends(optimizer_base_type) :: adam @@ -85,7 +95,8 @@ end subroutine minimize integer, private :: t = 0 contains procedure :: init => init_adam - procedure :: minimize => minimize_adam + procedure :: minimize_1d => minimize_adam_1d + procedure :: minimize_2d => minimize_adam_2d end type adam type, extends(optimizer_base_type) :: adagrad @@ -102,7 +113,8 @@ end subroutine minimize integer, private :: t = 0 contains procedure :: init => init_adagrad - procedure :: minimize => minimize_adagrad + procedure :: minimize_1d => minimize_adagrad_1d + procedure :: minimize_2d => minimize_adagrad_2d end type adagrad contains @@ -117,13 +129,12 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd(self, weights, biases, gradient) + pure subroutine minimize_sgd_1d(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. 
class(sgd), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) if (self % momentum > 0) then ! Apply momentum update @@ -131,21 +142,17 @@ pure subroutine minimize_sgd(self, weights, biases, gradient) - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - weights = weights + self % momentum * self % velocity & - - self % learning_rate * gradient - biases = biases + self % momentum * self % velocity & + param = param + self % momentum * self % velocity & - self % learning_rate * gradient else - weights = weights + self % velocity - biases = biases + self % velocity + param = param + self % velocity end if else ! Apply regular update - weights = weights - self % learning_rate * gradient - biases = biases - self % learning_rate * gradient + param = param - self % learning_rate * gradient end if - end subroutine minimize_sgd + end subroutine minimize_sgd_1d impure elemental subroutine init_rmsprop(self, num_params) @@ -158,24 +165,21 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop(self, weights, biases, gradient) + pure subroutine minimize_rmsprop_1d(self, param, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) ! Compute the RMS of the gradient using the RMSProp rule self % rms_gradient = self % decay_rate * self % rms_gradient & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient - weights = weights - self % learning_rate & - / sqrt(self % rms_gradient + self % epsilon) * gradient - biases = biases - self % learning_rate & + param = param - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient - end subroutine minimize_rmsprop + end subroutine minimize_rmsprop_1d impure elemental subroutine init_adam(self, num_params) @@ -189,18 +193,17 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam(self, weights, biases, gradient) + pure subroutine minimize_adam_1d(self, param, gradient) !! Concrete implementation of an Adam optimizer update rule. class(adam), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. - associate(g => gradient + self % weight_decay_l2 * weights) + associate(g => gradient + self % weight_decay_l2 * param) self % m = self % beta1 * self % m + (1 - self % beta1) * g self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 end associate @@ -212,19 +215,13 @@ pure subroutine minimize_adam(self, weights, biases, gradient) ) ! Update parameters. - weights = weights & + param = param & - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) & - + self % weight_decay_decoupled * weights) - - ! 
Update biases (without weight decay for biases) - associate(g => gradient) - biases = biases & - - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon)) - end associate + + self % weight_decay_decoupled * param) end associate - end subroutine minimize_adam + end subroutine minimize_adam_1d impure elemental subroutine init_adagrad(self, num_params) @@ -237,21 +234,19 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad(self, weights, biases, gradient) + pure subroutine minimize_adagrad_1d(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) ! Update the current time step self % t = self % t + 1 - ! For weights associate( & ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adagrad. - g => gradient + self % weight_decay_l2 * weights, & + g => gradient + self % weight_decay_l2 * param, & ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & @@ -259,21 +254,113 @@ pure subroutine minimize_adagrad(self, weights, biases, gradient) self % sum_squared_gradient = self % sum_squared_gradient + g**2 - weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) & + param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + self % epsilon) end associate - - ! For biases (without weight decay) + + end subroutine minimize_adagrad_1d + + + pure subroutine minimize_sgd_2d(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule for 2D arrays. + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]) + if (self % nesterov) then + ! Apply Nesterov update + param = param + reshape(self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) + else + param = param + reshape(self % velocity, shape(param)) + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + + end subroutine minimize_sgd_2d + + + pure subroutine minimize_rmsprop_2d(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient + + end subroutine minimize_rmsprop_2d + + + pure subroutine minimize_adam_2d(self, param, gradient) + !! Concrete implementation of an Adam optimizer update rule for 2D arrays. 
+ class(adam), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + self % t = self % t + 1 + + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adam. + associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) + self % m = self % beta1 * self % m + (1 - self % beta1) * g + self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + end associate + + ! Compute bias-corrected first and second moment estimates. + associate( & + m_hat => self % m / (1 - self % beta1**self % t), & + v_hat => self % v / (1 - self % beta2**self % t) & + ) + + ! Update parameters. + param = param & + - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & + - self % learning_rate * self % weight_decay_decoupled * param + + end associate + + end subroutine minimize_adam_2d + + + pure subroutine minimize_adagrad_2d(self, param, gradient) + !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. + class(adagrad), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Update the current time step + self % t = self % t + 1 + associate( & - g => gradient, & + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adagrad. + g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), & + ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & ) - biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) & - + self % epsilon) + + self % sum_squared_gradient = self % sum_squared_gradient + g**2 + + param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) & + + self % epsilon), shape(param)) + end associate - end subroutine minimize_adagrad + end subroutine minimize_adagrad_2d end module nf_optimizers From 21c5707af2e7f0b7cbc816e9378848ea06c9a591 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 27 May 2025 13:57:48 -0400 Subject: [PATCH 3/6] Send the data to optimizer without a copy works for dense layers --- src/nf/nf_network_submodule.f90 | 12 +-- src/nf/nf_optimizers.f90 | 150 ++++++++++++++++---------------- 2 files changed, 81 insertions(+), 81 deletions(-) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 1d36c5e8..eccea580 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -694,10 +694,6 @@ module subroutine update(self, optimizer, batch_size) end do #endif - !params = self % get_params() - !call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - !call self % set_params(params) - do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) type is(dense_layer) @@ -705,11 +701,15 @@ module subroutine update(self, optimizer, batch_size) call this_layer % get_gradients_ptr(dw, db) call self % optimizer % minimize(weights, dw / batch_size_) call self % optimizer % minimize(biases, db / batch_size_) - !call this_layer % set_params(weights, biases) + type is(locally_connected1d_layer) + !TODO + type is(conv1d_layer) + !TODO + type is(conv2d_layer) + !TODO end select end do - ! Flush network gradients to zero. 
do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 400fbfa2..f6759d67 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -155,6 +155,32 @@ pure subroutine minimize_sgd_1d(self, param, gradient) end subroutine minimize_sgd_1d + pure subroutine minimize_sgd_2d(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule for 2D arrays. + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]) + if (self % nesterov) then + ! Apply Nesterov update + param = param + reshape(self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) + else + param = param + reshape(self % velocity, shape(param)) + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + + end subroutine minimize_sgd_2d + + impure elemental subroutine init_rmsprop(self, num_params) class(rmsprop), intent(inout) :: self integer, intent(in) :: num_params @@ -182,6 +208,23 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient) end subroutine minimize_rmsprop_1d + pure subroutine minimize_rmsprop_2d(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient + + end subroutine minimize_rmsprop_2d + + impure elemental subroutine init_adam(self, num_params) class(adam), intent(inout) :: self integer, intent(in) :: num_params @@ -224,6 +267,37 @@ pure subroutine minimize_adam_1d(self, param, gradient) end subroutine minimize_adam_1d + pure subroutine minimize_adam_2d(self, param, gradient) + !! Concrete implementation of an Adam optimizer update rule for 2D arrays. + class(adam), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + self % t = self % t + 1 + + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adam. + associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) + self % m = self % beta1 * self % m + (1 - self % beta1) * g + self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + end associate + + ! Compute bias-corrected first and second moment estimates. + associate( & + m_hat => self % m / (1 - self % beta1**self % t), & + v_hat => self % v / (1 - self % beta2**self % t) & + ) + + ! Update parameters. 
+ param = param & + - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & + - self % learning_rate * self % weight_decay_decoupled * param + + end associate + + end subroutine minimize_adam_2d + + impure elemental subroutine init_adagrad(self, num_params) class(adagrad), intent(inout) :: self integer, intent(in) :: num_params @@ -262,80 +336,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient) end subroutine minimize_adagrad_1d - pure subroutine minimize_sgd_2d(self, param, gradient) - !! Concrete implementation of a stochastic gradient descent optimizer - !! update rule for 2D arrays. - class(sgd), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - if (self % momentum > 0) then - ! Apply momentum update - self % velocity = self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]) - if (self % nesterov) then - ! Apply Nesterov update - param = param + reshape(self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) - else - param = param + reshape(self % velocity, shape(param)) - end if - else - ! Apply regular update - param = param - self % learning_rate * gradient - end if - - end subroutine minimize_sgd_2d - - - pure subroutine minimize_rmsprop_2d(self, param, gradient) - !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. - class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & - + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 - - ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & - / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient - - end subroutine minimize_rmsprop_2d - - - pure subroutine minimize_adam_2d(self, param, gradient) - !! Concrete implementation of an Adam optimizer update rule for 2D arrays. - class(adam), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - self % t = self % t + 1 - - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adam. - associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 - end associate - - ! Compute bias-corrected first and second moment estimates. - associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & - ) - - ! Update parameters. - param = param & - - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & - - self % learning_rate * self % weight_decay_decoupled * param - - end associate - - end subroutine minimize_adam_2d - - pure subroutine minimize_adagrad_2d(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. 
class(adagrad), intent(inout) :: self @@ -363,4 +363,4 @@ pure subroutine minimize_adagrad_2d(self, param, gradient) end subroutine minimize_adagrad_2d -end module nf_optimizers +end module nf_optimizers \ No newline at end of file From 9d68828f7e29d66f435a6701996f1cb65f08416e Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 30 May 2025 13:47:28 -0400 Subject: [PATCH 4/6] Get weights and weight gradients as 1d --- src/nf/nf_dense_layer.f90 | 8 +- src/nf/nf_dense_layer_submodule.f90 | 12 +-- src/nf/nf_network_submodule.f90 | 2 +- src/nf/nf_optimizers.f90 | 145 +++------------------------- 4 files changed, 26 insertions(+), 141 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index ba6c33c4..a55ec892 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -100,8 +100,8 @@ end function get_params module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: w_ptr(:,:) - real, pointer :: b_ptr(:) + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) end subroutine get_params_ptr module function get_gradients(self) result(gradients) @@ -115,8 +115,8 @@ end function get_gradients module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: dw_ptr(:,:) - real, pointer :: db_ptr(:) + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a1ca6ce5..bb27c54a 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -79,9 +79,9 @@ end function get_params module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: w_ptr(:,:) - real, pointer :: b_ptr(:) - w_ptr => self % weights + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights b_ptr => self % biases end subroutine get_params_ptr @@ -104,9 +104,9 @@ end function get_gradients module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: dw_ptr(:,:) - real, pointer :: db_ptr(:) - dw_ptr => self % dw + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw db_ptr => self % db end subroutine get_gradients_ptr diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index eccea580..3508ec50 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,7 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) - real, pointer :: weights(:,:), biases(:), dw(:,:), db(:) + real, pointer :: weights(:), biases(:), dw(:), db(:) integer :: n ! Passing the optimizer instance is optional. 
If not provided, and if the diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index f6759d67..24089ccd 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -19,9 +19,7 @@ module nf_optimizers real :: learning_rate = 0.01 contains procedure(init), deferred :: init - procedure(minimize_1d), deferred :: minimize_1d - procedure(minimize_2d), deferred :: minimize_2d - generic :: minimize => minimize_1d, minimize_2d + procedure(minimize), deferred :: minimize end type optimizer_base_type abstract interface @@ -32,19 +30,12 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize_1d(self, param, gradient) + pure subroutine minimize(self, param, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) - end subroutine minimize_1d - - pure subroutine minimize_2d(self, param, gradient) - import :: optimizer_base_type - class(optimizer_base_type), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - end subroutine minimize_2d + end subroutine minimize end interface @@ -55,8 +46,7 @@ end subroutine minimize_2d real, allocatable, private :: velocity(:) contains procedure :: init => init_sgd - procedure :: minimize_1d => minimize_sgd_1d - procedure :: minimize_2d => minimize_sgd_2d + procedure :: minimize => minimize_sgd end type sgd type, extends(optimizer_base_type) :: rmsprop @@ -71,8 +61,7 @@ end subroutine minimize_2d real, allocatable, private :: rms_gradient(:) contains procedure :: init => init_rmsprop - procedure :: minimize_1d => minimize_rmsprop_1d - procedure :: minimize_2d => minimize_rmsprop_2d + procedure :: minimize => minimize_rmsprop end type rmsprop type, extends(optimizer_base_type) :: adam @@ -95,8 +84,7 @@ end subroutine minimize_2d integer, private :: t = 0 contains procedure :: init => init_adam - procedure :: minimize_1d => minimize_adam_1d - procedure :: minimize_2d => minimize_adam_2d + procedure :: minimize => minimize_adam end type adam type, extends(optimizer_base_type) :: adagrad @@ -113,8 +101,7 @@ end subroutine minimize_2d integer, private :: t = 0 contains procedure :: init => init_adagrad - procedure :: minimize_1d => minimize_adagrad_1d - procedure :: minimize_2d => minimize_adagrad_2d + procedure :: minimize => minimize_adagrad end type adagrad contains @@ -129,7 +116,7 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd_1d(self, param, gradient) + pure subroutine minimize_sgd(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. class(sgd), intent(inout) :: self @@ -152,33 +139,7 @@ pure subroutine minimize_sgd_1d(self, param, gradient) param = param - self % learning_rate * gradient end if - end subroutine minimize_sgd_1d - - - pure subroutine minimize_sgd_2d(self, param, gradient) - !! Concrete implementation of a stochastic gradient descent optimizer - !! update rule for 2D arrays. - class(sgd), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - if (self % momentum > 0) then - ! Apply momentum update - self % velocity = self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]) - if (self % nesterov) then - ! 
Apply Nesterov update - param = param + reshape(self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) - else - param = param + reshape(self % velocity, shape(param)) - end if - else - ! Apply regular update - param = param - self % learning_rate * gradient - end if - - end subroutine minimize_sgd_2d + end subroutine minimize_sgd impure elemental subroutine init_rmsprop(self, num_params) @@ -191,7 +152,7 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop_1d(self, param, gradient) + pure subroutine minimize_rmsprop(self, param, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self real, intent(inout) :: param(:) @@ -205,24 +166,7 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient) param = param - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient - end subroutine minimize_rmsprop_1d - - - pure subroutine minimize_rmsprop_2d(self, param, gradient) - !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. - class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & - + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 - - ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & - / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient - - end subroutine minimize_rmsprop_2d + end subroutine minimize_rmsprop impure elemental subroutine init_adam(self, num_params) @@ -236,7 +180,7 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam_1d(self, param, gradient) + pure subroutine minimize_adam(self, param, gradient) !! Concrete implementation of an Adam optimizer update rule. class(adam), intent(inout) :: self real, intent(inout) :: param(:) @@ -264,38 +208,7 @@ pure subroutine minimize_adam_1d(self, param, gradient) end associate - end subroutine minimize_adam_1d - - - pure subroutine minimize_adam_2d(self, param, gradient) - !! Concrete implementation of an Adam optimizer update rule for 2D arrays. - class(adam), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - self % t = self % t + 1 - - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adam. - associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 - end associate - - ! Compute bias-corrected first and second moment estimates. - associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & - ) - - ! Update parameters. 
- param = param & - - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & - - self % learning_rate * self % weight_decay_decoupled * param - - end associate - - end subroutine minimize_adam_2d + end subroutine minimize_adam impure elemental subroutine init_adagrad(self, num_params) @@ -308,7 +221,7 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad_1d(self, param, gradient) + pure subroutine minimize_adagrad(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self real, intent(inout) :: param(:) @@ -333,34 +246,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient) end associate - end subroutine minimize_adagrad_1d - - - pure subroutine minimize_adagrad_2d(self, param, gradient) - !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. - class(adagrad), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Update the current time step - self % t = self % t + 1 - - associate( & - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adagrad. - g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), & - ! Amortize the learning rate as function of the current time step. - learning_rate => self % learning_rate & - / (1 + (self % t - 1) * self % learning_rate_decay) & - ) - - self % sum_squared_gradient = self % sum_squared_gradient + g**2 - - param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) & - + self % epsilon), shape(param)) - - end associate - - end subroutine minimize_adagrad_2d + end subroutine minimize_adagrad end module nf_optimizers \ No newline at end of file From 2160f97f8a6ffac1b62f6f25e38b752c4ba2d65b Mon Sep 17 00:00:00 2001 From: milancurcic Date: Thu, 19 Jun 2025 23:49:05 -0400 Subject: [PATCH 5/6] get_params_ptr and get_gradients_ptr for conv1d, conv2d, and locally_connected1d --- src/nf/nf_conv1d_layer.f90 | 22 ++++++++++++++ src/nf/nf_conv1d_layer_submodule.f90 | 16 ++++++++++ src/nf/nf_conv2d_layer.f90 | 22 ++++++++++++++ src/nf/nf_conv2d_layer_submodule.f90 | 18 ++++++++++++ src/nf/nf_locally_connected1d_layer.f90 | 14 +++++++++ ...nf_locally_connected1d_layer_submodule.f90 | 16 ++++++++++ src/nf/nf_network_submodule.f90 | 29 +++++++++---------- 7 files changed, 122 insertions(+), 15 deletions(-) diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 index c39b11fc..871eef02 100644 --- a/src/nf/nf_conv1d_layer.f90 +++ b/src/nf/nf_conv1d_layer.f90 @@ -32,8 +32,10 @@ module nf_conv1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,16 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! 
The gradients are ordered as weights first, biases second. @@ -106,6 +118,16 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(conv1d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 index 5404b9c7..05bcde70 100644 --- a/src/nf/nf_conv1d_layer_submodule.f90 +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -152,6 +152,14 @@ module function get_params(self) result(params) params = [ w_, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(conv1d_layer), intent(in), target :: self real, allocatable :: gradients(:) @@ -160,6 +168,14 @@ module function get_gradients(self) result(gradients) gradients = [ dw_, self % db ] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(conv1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 4b79376e..3f7b28db 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -33,8 +33,10 @@ module nf_conv2d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -98,6 +100,16 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -107,6 +119,16 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! 
Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(conv2d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index 45a2c1da..b617ec34 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -204,6 +204,15 @@ module function get_params(self) result(params) end function get_params + + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(conv2d_layer), intent(in), target :: self @@ -221,6 +230,15 @@ module function get_gradients(self) result(gradients) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(conv2d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_locally_connected1d_layer.f90 b/src/nf/nf_locally_connected1d_layer.f90 index beca76d5..6fea2c5c 100644 --- a/src/nf/nf_locally_connected1d_layer.f90 +++ b/src/nf/nf_locally_connected1d_layer.f90 @@ -32,8 +32,10 @@ module nf_locally_connected1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,12 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -106,6 +114,12 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. 
class(locally_connected1d_layer), intent(in out) :: self diff --git a/src/nf/nf_locally_connected1d_layer_submodule.f90 b/src/nf/nf_locally_connected1d_layer_submodule.f90 index 053c520b..fa6110d5 100644 --- a/src/nf/nf_locally_connected1d_layer_submodule.f90 +++ b/src/nf/nf_locally_connected1d_layer_submodule.f90 @@ -128,12 +128,28 @@ module function get_params(self) result(params) params = [self % kernel, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr(1:size(self % biases)) => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(locally_connected1d_layer), intent(in), target :: self real, allocatable :: gradients(:) gradients = [self % dw, self % db] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr(1:size(self % db)) => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(locally_connected1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 3508ec50..60c0e151 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -701,28 +701,27 @@ module subroutine update(self, optimizer, batch_size) call this_layer % get_gradients_ptr(dw, db) call self % optimizer % minimize(weights, dw / batch_size_) call self % optimizer % minimize(biases, db / batch_size_) - type is(locally_connected1d_layer) - !TODO - type is(conv1d_layer) - !TODO - type is(conv2d_layer) - !TODO - end select - end do - - ! Flush network gradients to zero. 
- do n = 2, size(self % layers) - select type(this_layer => self % layers(n) % p) - type is(dense_layer) this_layer % dw = 0 this_layer % db = 0 - type is(conv2d_layer) + type is(conv1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 - type is(conv1d_layer) + type is(conv2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select From 0e11f1016828f229dbb5d1f50d7c573ff9a9c918 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 20 Jun 2025 13:59:22 -0400 Subject: [PATCH 6/6] Define optimizer instance per layer to preserve memory across layers --- src/nf/nf_layer.f90 | 1 + src/nf/nf_network_submodule.f90 | 46 +++++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 517622b0..b12592f3 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -22,6 +22,7 @@ module nf_layer integer, allocatable :: layer_shape(:) integer, allocatable :: input_layer_shape(:) logical :: initialized = .false. + class(optimizer_base_type), allocatable :: optimizer contains diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 60c0e151..876070bc 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -597,12 +597,26 @@ module subroutine train(self, input_data, output_data, batch_size, & ! If not provided, we default to SGD with its default settings. if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if call self % optimizer % init(self % get_num_params()) + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + ! Passing the loss instance is optional. ! If not provided, we default to quadratic(). if (present(loss)) then @@ -662,10 +676,26 @@ module subroutine update(self, optimizer, batch_size) if (.not. 
allocated(self % optimizer)) then if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if + call self % optimizer % init(self % get_num_params()) + + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + end if if (present(batch_size)) then @@ -699,29 +729,29 @@ module subroutine update(self, optimizer, batch_size) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv2d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select
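
The central mechanism in this patch set is Fortran's bounds-remapping (rank-remapping) pointer assignment, which is what lets get_params_ptr and get_gradients_ptr expose a layer's 2-d weight and gradient arrays as flat 1-d pointers so the optimizer can update parameters in place, avoiding the copies made by get_params/set_params. Below is a minimal, self-contained sketch of that mechanism, separate from the patches above; the variable names are illustrative stand-ins, not the library's.

program pointer_remap_sketch
  ! Standalone sketch, not part of the patches above: demonstrates the
  ! bounds-remapping pointer assignment used by get_params_ptr and
  ! get_gradients_ptr to view a 2-d weight array as a flat 1-d pointer,
  ! so an optimizer can update the parameters in place.
  implicit none
  real, target :: weights(3, 2)         ! stand-in for dense_layer % weights
  real, target :: dw(3, 2)              ! stand-in for the weight gradients
  real, pointer :: w_ptr(:), dw_ptr(:)  ! flat views of the same storage
  real, parameter :: learning_rate = 0.01

  weights = 1.0
  dw = 0.5

  ! Bounds-remapping pointer assignment (Fortran 2008): the 1-d pointers
  ! span the same contiguous storage as the 2-d targets; no copy is made.
  w_ptr(1:size(weights)) => weights
  dw_ptr(1:size(dw)) => dw

  ! A plain gradient-descent step through the flat view updates the
  ! original 2-d array in place.
  w_ptr = w_ptr - learning_rate * dw_ptr

  print *, weights  ! every element is now 0.995
end program pointer_remap_sketch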
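
The last patch gives every layer its own optimizer instance so that stateful optimizers keep their per-parameter memory (momentum velocity, Adam moments, accumulated squared gradients) sized and aligned to the parameters they update. The sketch below shows the shape of that pattern with a simplified stand-in type, sgd_t; it is an illustration under those assumptions, not the library's actual sgd or layer types.

module per_layer_optimizer_sketch
  ! Simplified stand-in for an optimizer with per-parameter state.
  implicit none
  private
  public :: sgd_t

  type :: sgd_t
    real :: learning_rate = 0.01
    real :: momentum = 0.9
    real, allocatable :: velocity(:)
  contains
    procedure :: init => sgd_init
    procedure :: minimize => sgd_minimize
  end type sgd_t

contains

  subroutine sgd_init(self, num_params)
    ! The state array is sized to one layer's parameters; keeping one
    ! such instance per layer is what lets minimize be called with
    ! per-layer arrays without mixing state between layers.
    class(sgd_t), intent(inout) :: self
    integer, intent(in) :: num_params
    if (.not. allocated(self % velocity)) then
      allocate(self % velocity(num_params))
      self % velocity = 0
    end if
  end subroutine sgd_init

  pure subroutine sgd_minimize(self, param, gradient)
    ! Momentum update applied in place to the parameter array.
    class(sgd_t), intent(inout) :: self
    real, intent(inout) :: param(:)
    real, intent(in) :: gradient(:)
    self % velocity = self % momentum * self % velocity &
      - self % learning_rate * gradient
    param = param + self % velocity
  end subroutine sgd_minimize

end module per_layer_optimizer_sketch

program per_layer_demo
  ! Two layers, each with its own optimizer instance and its own
  ! momentum state, mirroring the per-layer pattern of the last patch.
  use per_layer_optimizer_sketch, only: sgd_t
  implicit none
  type(sgd_t) :: opt(2)
  real :: w1(6), w2(4), g1(6), g2(4)
  integer :: n

  w1 = 1.0; w2 = 1.0
  g1 = 0.1; g2 = 0.2

  call opt(1) % init(size(w1))
  call opt(2) % init(size(w2))

  do n = 1, 3
    call opt(1) % minimize(w1, g1)
    call opt(2) % minimize(w2, g2)
  end do

  print *, w1(1), w2(1)
end program per_layer_demo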