From bf1478e538bc52051b3e3394de24f9be75820ce8 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 23 May 2025 09:52:14 -0400 Subject: [PATCH 1/6] WIP optimizer refactor w/ pointers --- src/nf/nf_dense_layer.f90 | 7 +++ src/nf/nf_dense_layer_submodule.f90 | 9 ++++ src/nf/nf_network_submodule.f90 | 17 +++++-- src/nf/nf_optimizers.f90 | 76 ++++++++++++++++++++--------- 4 files changed, 82 insertions(+), 27 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 862f4cdf..462434f6 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -36,6 +36,7 @@ module nf_dense_layer procedure :: get_gradients procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -96,6 +97,12 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: w_ptr(:,:) + real, pointer :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a424cf9c..d0ac015a 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -77,6 +77,15 @@ module function get_params(self) result(params) end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: w_ptr(:,:) + real, pointer :: b_ptr(:) + w_ptr => self % weights + b_ptr => self % biases + end subroutine get_params_ptr + + module function get_gradients(self) result(gradients) class(dense_layer), intent(in), target :: self real, allocatable :: gradients(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index d8f5ff50..e7c39716 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,6 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) + real, pointer :: weights(:), biases(:), gradient(:) integer :: n ! Passing the optimizer instance is optional. If not provided, and if the @@ -693,9 +694,19 @@ module subroutine update(self, optimizer, batch_size) end do #endif - params = self % get_params() - call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - call self % set_params(params) + !params = self % get_params() + !call self % optimizer % minimize(params, self % get_gradients() / batch_size_) + !call self % set_params(params) + + do n = 2, size(self % layers) + select type(this_layer => self % layers(n) % p) + type is(dense_layer) + call this_layer % get_params_ptr(weights, biases) + call self % optimizer % minimize(weights, biases, self % get_gradients() / batch_size_) + !call this_layer % set_params(weights, biases) + end select + end do + ! Flush network gradients to zero. 
do n = 2, size(self % layers) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index c64cefed..1caf8c1e 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -30,11 +30,12 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize(self, param, gradient) + pure subroutine minimize(self, weights, biases, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) end subroutine minimize end interface @@ -116,12 +117,13 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd(self, param, gradient) + pure subroutine minimize_sgd(self, weights, biases, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. class(sgd), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) if (self % momentum > 0) then ! Apply momentum update @@ -129,14 +131,18 @@ pure subroutine minimize_sgd(self, param, gradient) - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - param = param + self % momentum * self % velocity & + weights = weights + self % momentum * self % velocity & + - self % learning_rate * gradient + biases = biases + self % momentum * self % velocity & - self % learning_rate * gradient else - param = param + self % velocity + weights = weights + self % velocity + biases = biases + self % velocity end if else ! Apply regular update - param = param - self % learning_rate * gradient + weights = weights - self % learning_rate * gradient + biases = biases - self % learning_rate * gradient end if end subroutine minimize_sgd @@ -152,18 +158,21 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop(self, param, gradient) + pure subroutine minimize_rmsprop(self, weights, biases, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) ! Compute the RMS of the gradient using the RMSProp rule self % rms_gradient = self % decay_rate * self % rms_gradient & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & + weights = weights - self % learning_rate & + / sqrt(self % rms_gradient + self % epsilon) * gradient + biases = biases - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient end subroutine minimize_rmsprop @@ -180,17 +189,18 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam(self, param, gradient) + pure subroutine minimize_adam(self, weights, biases, gradient) !! Concrete implementation of an Adam optimizer update rule. 
class(adam), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. - associate(g => gradient + self % weight_decay_l2 * param) + associate(g => gradient + self % weight_decay_l2 * weights) self % m = self % beta1 * self % m + (1 - self % beta1) * g self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 end associate @@ -202,9 +212,15 @@ pure subroutine minimize_adam(self, param, gradient) ) ! Update parameters. - param = param & + weights = weights & - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) & - + self % weight_decay_decoupled * param) + + self % weight_decay_decoupled * weights) + + ! Update biases (without weight decay for biases) + associate(g => gradient) + biases = biases & + - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon)) + end associate end associate @@ -221,19 +237,21 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad(self, param, gradient) + pure subroutine minimize_adagrad(self, weights, biases, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) ! Update the current time step self % t = self % t + 1 + ! For weights associate( & ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adagrad. - g => gradient + self % weight_decay_l2 * param, & + g => gradient + self % weight_decay_l2 * weights, & ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & @@ -241,10 +259,20 @@ pure subroutine minimize_adagrad(self, param, gradient) self % sum_squared_gradient = self % sum_squared_gradient + g**2 - param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) & + self % epsilon) end associate + + ! 
For biases (without weight decay) + associate( & + g => gradient, & + learning_rate => self % learning_rate & + / (1 + (self % t - 1) * self % learning_rate_decay) & + ) + biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) & + + self % epsilon) + end associate end subroutine minimize_adagrad From 38896cc57abc017987f8b46b9650cb0ec3151545 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 27 May 2025 11:53:57 -0400 Subject: [PATCH 2/6] WIP optimizer optimization --- src/nf/nf_dense_layer.f90 | 7 + src/nf/nf_dense_layer_submodule.f90 | 9 ++ src/nf/nf_network_submodule.f90 | 6 +- src/nf/nf_optimizers.f90 | 201 ++++++++++++++++++++-------- 4 files changed, 164 insertions(+), 59 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 462434f6..ba6c33c4 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -34,6 +34,7 @@ module nf_dense_layer procedure :: backward procedure :: forward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params procedure :: get_params_ptr @@ -112,6 +113,12 @@ module function get_gradients(self) result(gradients) !! Gradients of this layer end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: dw_ptr(:,:) + real, pointer :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of this layer. !! The parameters are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index d0ac015a..a1ca6ce5 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -102,6 +102,15 @@ module function get_gradients(self) result(gradients) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: dw_ptr(:,:) + real, pointer :: db_ptr(:) + dw_ptr => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(dense_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index e7c39716..1d36c5e8 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,7 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) - real, pointer :: weights(:), biases(:), gradient(:) + real, pointer :: weights(:,:), biases(:), dw(:,:), db(:) integer :: n ! Passing the optimizer instance is optional. 
If not provided, and if the @@ -702,7 +702,9 @@ module subroutine update(self, optimizer, batch_size) select type(this_layer => self % layers(n) % p) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) - call self % optimizer % minimize(weights, biases, self % get_gradients() / batch_size_) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) !call this_layer % set_params(weights, biases) end select end do diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 1caf8c1e..400fbfa2 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -19,7 +19,9 @@ module nf_optimizers real :: learning_rate = 0.01 contains procedure(init), deferred :: init - procedure(minimize), deferred :: minimize + procedure(minimize_1d), deferred :: minimize_1d + procedure(minimize_2d), deferred :: minimize_2d + generic :: minimize => minimize_1d, minimize_2d end type optimizer_base_type abstract interface @@ -30,13 +32,19 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize(self, weights, biases, gradient) + pure subroutine minimize_1d(self, param, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) - end subroutine minimize + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) + end subroutine minimize_1d + + pure subroutine minimize_2d(self, param, gradient) + import :: optimizer_base_type + class(optimizer_base_type), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + end subroutine minimize_2d end interface @@ -47,7 +55,8 @@ end subroutine minimize real, allocatable, private :: velocity(:) contains procedure :: init => init_sgd - procedure :: minimize => minimize_sgd + procedure :: minimize_1d => minimize_sgd_1d + procedure :: minimize_2d => minimize_sgd_2d end type sgd type, extends(optimizer_base_type) :: rmsprop @@ -62,7 +71,8 @@ end subroutine minimize real, allocatable, private :: rms_gradient(:) contains procedure :: init => init_rmsprop - procedure :: minimize => minimize_rmsprop + procedure :: minimize_1d => minimize_rmsprop_1d + procedure :: minimize_2d => minimize_rmsprop_2d end type rmsprop type, extends(optimizer_base_type) :: adam @@ -85,7 +95,8 @@ end subroutine minimize integer, private :: t = 0 contains procedure :: init => init_adam - procedure :: minimize => minimize_adam + procedure :: minimize_1d => minimize_adam_1d + procedure :: minimize_2d => minimize_adam_2d end type adam type, extends(optimizer_base_type) :: adagrad @@ -102,7 +113,8 @@ end subroutine minimize integer, private :: t = 0 contains procedure :: init => init_adagrad - procedure :: minimize => minimize_adagrad + procedure :: minimize_1d => minimize_adagrad_1d + procedure :: minimize_2d => minimize_adagrad_2d end type adagrad contains @@ -117,13 +129,12 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd(self, weights, biases, gradient) + pure subroutine minimize_sgd_1d(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. 
class(sgd), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) if (self % momentum > 0) then ! Apply momentum update @@ -131,21 +142,17 @@ pure subroutine minimize_sgd(self, weights, biases, gradient) - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - weights = weights + self % momentum * self % velocity & - - self % learning_rate * gradient - biases = biases + self % momentum * self % velocity & + param = param + self % momentum * self % velocity & - self % learning_rate * gradient else - weights = weights + self % velocity - biases = biases + self % velocity + param = param + self % velocity end if else ! Apply regular update - weights = weights - self % learning_rate * gradient - biases = biases - self % learning_rate * gradient + param = param - self % learning_rate * gradient end if - end subroutine minimize_sgd + end subroutine minimize_sgd_1d impure elemental subroutine init_rmsprop(self, num_params) @@ -158,24 +165,21 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop(self, weights, biases, gradient) + pure subroutine minimize_rmsprop_1d(self, param, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) ! Compute the RMS of the gradient using the RMSProp rule self % rms_gradient = self % decay_rate * self % rms_gradient & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient - weights = weights - self % learning_rate & - / sqrt(self % rms_gradient + self % epsilon) * gradient - biases = biases - self % learning_rate & + param = param - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient - end subroutine minimize_rmsprop + end subroutine minimize_rmsprop_1d impure elemental subroutine init_adam(self, num_params) @@ -189,18 +193,17 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam(self, weights, biases, gradient) + pure subroutine minimize_adam_1d(self, param, gradient) !! Concrete implementation of an Adam optimizer update rule. class(adam), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. - associate(g => gradient + self % weight_decay_l2 * weights) + associate(g => gradient + self % weight_decay_l2 * param) self % m = self % beta1 * self % m + (1 - self % beta1) * g self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 end associate @@ -212,19 +215,13 @@ pure subroutine minimize_adam(self, weights, biases, gradient) ) ! Update parameters. - weights = weights & + param = param & - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) & - + self % weight_decay_decoupled * weights) - - ! 
Update biases (without weight decay for biases) - associate(g => gradient) - biases = biases & - - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon)) - end associate + + self % weight_decay_decoupled * param) end associate - end subroutine minimize_adam + end subroutine minimize_adam_1d impure elemental subroutine init_adagrad(self, num_params) @@ -237,21 +234,19 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad(self, weights, biases, gradient) + pure subroutine minimize_adagrad_1d(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) ! Update the current time step self % t = self % t + 1 - ! For weights associate( & ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adagrad. - g => gradient + self % weight_decay_l2 * weights, & + g => gradient + self % weight_decay_l2 * param, & ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & @@ -259,21 +254,113 @@ pure subroutine minimize_adagrad(self, weights, biases, gradient) self % sum_squared_gradient = self % sum_squared_gradient + g**2 - weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) & + param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + self % epsilon) end associate - - ! For biases (without weight decay) + + end subroutine minimize_adagrad_1d + + + pure subroutine minimize_sgd_2d(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule for 2D arrays. + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]) + if (self % nesterov) then + ! Apply Nesterov update + param = param + reshape(self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) + else + param = param + reshape(self % velocity, shape(param)) + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + + end subroutine minimize_sgd_2d + + + pure subroutine minimize_rmsprop_2d(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient + + end subroutine minimize_rmsprop_2d + + + pure subroutine minimize_adam_2d(self, param, gradient) + !! Concrete implementation of an Adam optimizer update rule for 2D arrays. 
+ class(adam), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + self % t = self % t + 1 + + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adam. + associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) + self % m = self % beta1 * self % m + (1 - self % beta1) * g + self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + end associate + + ! Compute bias-corrected first and second moment estimates. + associate( & + m_hat => self % m / (1 - self % beta1**self % t), & + v_hat => self % v / (1 - self % beta2**self % t) & + ) + + ! Update parameters. + param = param & + - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & + - self % learning_rate * self % weight_decay_decoupled * param + + end associate + + end subroutine minimize_adam_2d + + + pure subroutine minimize_adagrad_2d(self, param, gradient) + !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. + class(adagrad), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Update the current time step + self % t = self % t + 1 + associate( & - g => gradient, & + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adagrad. + g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), & + ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & ) - biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) & - + self % epsilon) + + self % sum_squared_gradient = self % sum_squared_gradient + g**2 + + param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) & + + self % epsilon), shape(param)) + end associate - end subroutine minimize_adagrad + end subroutine minimize_adagrad_2d end module nf_optimizers From 21c5707af2e7f0b7cbc816e9378848ea06c9a591 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 27 May 2025 13:57:48 -0400 Subject: [PATCH 3/6] Send the data to optimizer without a copy works for dense layers --- src/nf/nf_network_submodule.f90 | 12 +-- src/nf/nf_optimizers.f90 | 150 ++++++++++++++++---------------- 2 files changed, 81 insertions(+), 81 deletions(-) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 1d36c5e8..eccea580 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -694,10 +694,6 @@ module subroutine update(self, optimizer, batch_size) end do #endif - !params = self % get_params() - !call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - !call self % set_params(params) - do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) type is(dense_layer) @@ -705,11 +701,15 @@ module subroutine update(self, optimizer, batch_size) call this_layer % get_gradients_ptr(dw, db) call self % optimizer % minimize(weights, dw / batch_size_) call self % optimizer % minimize(biases, db / batch_size_) - !call this_layer % set_params(weights, biases) + type is(locally_connected1d_layer) + !TODO + type is(conv1d_layer) + !TODO + type is(conv2d_layer) + !TODO end select end do - ! Flush network gradients to zero. 
do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 400fbfa2..f6759d67 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -155,6 +155,32 @@ pure subroutine minimize_sgd_1d(self, param, gradient) end subroutine minimize_sgd_1d + pure subroutine minimize_sgd_2d(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule for 2D arrays. + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]) + if (self % nesterov) then + ! Apply Nesterov update + param = param + reshape(self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) + else + param = param + reshape(self % velocity, shape(param)) + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + + end subroutine minimize_sgd_2d + + impure elemental subroutine init_rmsprop(self, num_params) class(rmsprop), intent(inout) :: self integer, intent(in) :: num_params @@ -182,6 +208,23 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient) end subroutine minimize_rmsprop_1d + pure subroutine minimize_rmsprop_2d(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient + + end subroutine minimize_rmsprop_2d + + impure elemental subroutine init_adam(self, num_params) class(adam), intent(inout) :: self integer, intent(in) :: num_params @@ -224,6 +267,37 @@ pure subroutine minimize_adam_1d(self, param, gradient) end subroutine minimize_adam_1d + pure subroutine minimize_adam_2d(self, param, gradient) + !! Concrete implementation of an Adam optimizer update rule for 2D arrays. + class(adam), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + self % t = self % t + 1 + + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adam. + associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) + self % m = self % beta1 * self % m + (1 - self % beta1) * g + self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + end associate + + ! Compute bias-corrected first and second moment estimates. + associate( & + m_hat => self % m / (1 - self % beta1**self % t), & + v_hat => self % v / (1 - self % beta2**self % t) & + ) + + ! Update parameters. 
+ param = param & + - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & + - self % learning_rate * self % weight_decay_decoupled * param + + end associate + + end subroutine minimize_adam_2d + + impure elemental subroutine init_adagrad(self, num_params) class(adagrad), intent(inout) :: self integer, intent(in) :: num_params @@ -262,80 +336,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient) end subroutine minimize_adagrad_1d - pure subroutine minimize_sgd_2d(self, param, gradient) - !! Concrete implementation of a stochastic gradient descent optimizer - !! update rule for 2D arrays. - class(sgd), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - if (self % momentum > 0) then - ! Apply momentum update - self % velocity = self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]) - if (self % nesterov) then - ! Apply Nesterov update - param = param + reshape(self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) - else - param = param + reshape(self % velocity, shape(param)) - end if - else - ! Apply regular update - param = param - self % learning_rate * gradient - end if - - end subroutine minimize_sgd_2d - - - pure subroutine minimize_rmsprop_2d(self, param, gradient) - !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. - class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & - + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 - - ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & - / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient - - end subroutine minimize_rmsprop_2d - - - pure subroutine minimize_adam_2d(self, param, gradient) - !! Concrete implementation of an Adam optimizer update rule for 2D arrays. - class(adam), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - self % t = self % t + 1 - - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adam. - associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 - end associate - - ! Compute bias-corrected first and second moment estimates. - associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & - ) - - ! Update parameters. - param = param & - - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & - - self % learning_rate * self % weight_decay_decoupled * param - - end associate - - end subroutine minimize_adam_2d - - pure subroutine minimize_adagrad_2d(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. 
class(adagrad), intent(inout) :: self @@ -363,4 +363,4 @@ pure subroutine minimize_adagrad_2d(self, param, gradient) end subroutine minimize_adagrad_2d -end module nf_optimizers +end module nf_optimizers \ No newline at end of file From 9d68828f7e29d66f435a6701996f1cb65f08416e Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 30 May 2025 13:47:28 -0400 Subject: [PATCH 4/6] Get weights and weight gradients as 1d --- src/nf/nf_dense_layer.f90 | 8 +- src/nf/nf_dense_layer_submodule.f90 | 12 +-- src/nf/nf_network_submodule.f90 | 2 +- src/nf/nf_optimizers.f90 | 145 +++------------------------- 4 files changed, 26 insertions(+), 141 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index ba6c33c4..a55ec892 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -100,8 +100,8 @@ end function get_params module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: w_ptr(:,:) - real, pointer :: b_ptr(:) + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) end subroutine get_params_ptr module function get_gradients(self) result(gradients) @@ -115,8 +115,8 @@ end function get_gradients module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: dw_ptr(:,:) - real, pointer :: db_ptr(:) + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a1ca6ce5..bb27c54a 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -79,9 +79,9 @@ end function get_params module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: w_ptr(:,:) - real, pointer :: b_ptr(:) - w_ptr => self % weights + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights b_ptr => self % biases end subroutine get_params_ptr @@ -104,9 +104,9 @@ end function get_gradients module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: dw_ptr(:,:) - real, pointer :: db_ptr(:) - dw_ptr => self % dw + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw db_ptr => self % db end subroutine get_gradients_ptr diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index eccea580..3508ec50 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,7 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) - real, pointer :: weights(:,:), biases(:), dw(:,:), db(:) + real, pointer :: weights(:), biases(:), dw(:), db(:) integer :: n ! Passing the optimizer instance is optional. 
If not provided, and if the diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index f6759d67..24089ccd 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -19,9 +19,7 @@ module nf_optimizers real :: learning_rate = 0.01 contains procedure(init), deferred :: init - procedure(minimize_1d), deferred :: minimize_1d - procedure(minimize_2d), deferred :: minimize_2d - generic :: minimize => minimize_1d, minimize_2d + procedure(minimize), deferred :: minimize end type optimizer_base_type abstract interface @@ -32,19 +30,12 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize_1d(self, param, gradient) + pure subroutine minimize(self, param, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) - end subroutine minimize_1d - - pure subroutine minimize_2d(self, param, gradient) - import :: optimizer_base_type - class(optimizer_base_type), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - end subroutine minimize_2d + end subroutine minimize end interface @@ -55,8 +46,7 @@ end subroutine minimize_2d real, allocatable, private :: velocity(:) contains procedure :: init => init_sgd - procedure :: minimize_1d => minimize_sgd_1d - procedure :: minimize_2d => minimize_sgd_2d + procedure :: minimize => minimize_sgd end type sgd type, extends(optimizer_base_type) :: rmsprop @@ -71,8 +61,7 @@ end subroutine minimize_2d real, allocatable, private :: rms_gradient(:) contains procedure :: init => init_rmsprop - procedure :: minimize_1d => minimize_rmsprop_1d - procedure :: minimize_2d => minimize_rmsprop_2d + procedure :: minimize => minimize_rmsprop end type rmsprop type, extends(optimizer_base_type) :: adam @@ -95,8 +84,7 @@ end subroutine minimize_2d integer, private :: t = 0 contains procedure :: init => init_adam - procedure :: minimize_1d => minimize_adam_1d - procedure :: minimize_2d => minimize_adam_2d + procedure :: minimize => minimize_adam end type adam type, extends(optimizer_base_type) :: adagrad @@ -113,8 +101,7 @@ end subroutine minimize_2d integer, private :: t = 0 contains procedure :: init => init_adagrad - procedure :: minimize_1d => minimize_adagrad_1d - procedure :: minimize_2d => minimize_adagrad_2d + procedure :: minimize => minimize_adagrad end type adagrad contains @@ -129,7 +116,7 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd_1d(self, param, gradient) + pure subroutine minimize_sgd(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. class(sgd), intent(inout) :: self @@ -152,33 +139,7 @@ pure subroutine minimize_sgd_1d(self, param, gradient) param = param - self % learning_rate * gradient end if - end subroutine minimize_sgd_1d - - - pure subroutine minimize_sgd_2d(self, param, gradient) - !! Concrete implementation of a stochastic gradient descent optimizer - !! update rule for 2D arrays. - class(sgd), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - if (self % momentum > 0) then - ! Apply momentum update - self % velocity = self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]) - if (self % nesterov) then - ! 
Apply Nesterov update - param = param + reshape(self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) - else - param = param + reshape(self % velocity, shape(param)) - end if - else - ! Apply regular update - param = param - self % learning_rate * gradient - end if - - end subroutine minimize_sgd_2d + end subroutine minimize_sgd impure elemental subroutine init_rmsprop(self, num_params) @@ -191,7 +152,7 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop_1d(self, param, gradient) + pure subroutine minimize_rmsprop(self, param, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self real, intent(inout) :: param(:) @@ -205,24 +166,7 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient) param = param - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient - end subroutine minimize_rmsprop_1d - - - pure subroutine minimize_rmsprop_2d(self, param, gradient) - !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. - class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & - + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 - - ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & - / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient - - end subroutine minimize_rmsprop_2d + end subroutine minimize_rmsprop impure elemental subroutine init_adam(self, num_params) @@ -236,7 +180,7 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam_1d(self, param, gradient) + pure subroutine minimize_adam(self, param, gradient) !! Concrete implementation of an Adam optimizer update rule. class(adam), intent(inout) :: self real, intent(inout) :: param(:) @@ -264,38 +208,7 @@ pure subroutine minimize_adam_1d(self, param, gradient) end associate - end subroutine minimize_adam_1d - - - pure subroutine minimize_adam_2d(self, param, gradient) - !! Concrete implementation of an Adam optimizer update rule for 2D arrays. - class(adam), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - self % t = self % t + 1 - - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adam. - associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 - end associate - - ! Compute bias-corrected first and second moment estimates. - associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & - ) - - ! Update parameters. 
- param = param & - - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & - - self % learning_rate * self % weight_decay_decoupled * param - - end associate - - end subroutine minimize_adam_2d + end subroutine minimize_adam impure elemental subroutine init_adagrad(self, num_params) @@ -308,7 +221,7 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad_1d(self, param, gradient) + pure subroutine minimize_adagrad(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self real, intent(inout) :: param(:) @@ -333,34 +246,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient) end associate - end subroutine minimize_adagrad_1d - - - pure subroutine minimize_adagrad_2d(self, param, gradient) - !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. - class(adagrad), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Update the current time step - self % t = self % t + 1 - - associate( & - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adagrad. - g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), & - ! Amortize the learning rate as function of the current time step. - learning_rate => self % learning_rate & - / (1 + (self % t - 1) * self % learning_rate_decay) & - ) - - self % sum_squared_gradient = self % sum_squared_gradient + g**2 - - param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) & - + self % epsilon), shape(param)) - - end associate - - end subroutine minimize_adagrad_2d + end subroutine minimize_adagrad end module nf_optimizers \ No newline at end of file From 2160f97f8a6ffac1b62f6f25e38b752c4ba2d65b Mon Sep 17 00:00:00 2001 From: milancurcic Date: Thu, 19 Jun 2025 23:49:05 -0400 Subject: [PATCH 5/6] get_params_ptr and get_gradients_ptr for conv1d, conv2d, and locally_connected1d --- src/nf/nf_conv1d_layer.f90 | 22 ++++++++++++++ src/nf/nf_conv1d_layer_submodule.f90 | 16 ++++++++++ src/nf/nf_conv2d_layer.f90 | 22 ++++++++++++++ src/nf/nf_conv2d_layer_submodule.f90 | 18 ++++++++++++ src/nf/nf_locally_connected1d_layer.f90 | 14 +++++++++ ...nf_locally_connected1d_layer_submodule.f90 | 16 ++++++++++ src/nf/nf_network_submodule.f90 | 29 +++++++++---------- 7 files changed, 122 insertions(+), 15 deletions(-) diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 index c39b11fc..871eef02 100644 --- a/src/nf/nf_conv1d_layer.f90 +++ b/src/nf/nf_conv1d_layer.f90 @@ -32,8 +32,10 @@ module nf_conv1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,16 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! 
The gradients are ordered as weights first, biases second. @@ -106,6 +118,16 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(conv1d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 index 5404b9c7..05bcde70 100644 --- a/src/nf/nf_conv1d_layer_submodule.f90 +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -152,6 +152,14 @@ module function get_params(self) result(params) params = [ w_, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(conv1d_layer), intent(in), target :: self real, allocatable :: gradients(:) @@ -160,6 +168,14 @@ module function get_gradients(self) result(gradients) gradients = [ dw_, self % db ] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(conv1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 4b79376e..3f7b28db 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -33,8 +33,10 @@ module nf_conv2d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -98,6 +100,16 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -107,6 +119,16 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! 
Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(conv2d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index 45a2c1da..b617ec34 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -204,6 +204,15 @@ module function get_params(self) result(params) end function get_params + + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(conv2d_layer), intent(in), target :: self @@ -221,6 +230,15 @@ module function get_gradients(self) result(gradients) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(conv2d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_locally_connected1d_layer.f90 b/src/nf/nf_locally_connected1d_layer.f90 index beca76d5..6fea2c5c 100644 --- a/src/nf/nf_locally_connected1d_layer.f90 +++ b/src/nf/nf_locally_connected1d_layer.f90 @@ -32,8 +32,10 @@ module nf_locally_connected1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,12 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -106,6 +114,12 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. 
class(locally_connected1d_layer), intent(in out) :: self diff --git a/src/nf/nf_locally_connected1d_layer_submodule.f90 b/src/nf/nf_locally_connected1d_layer_submodule.f90 index 053c520b..fa6110d5 100644 --- a/src/nf/nf_locally_connected1d_layer_submodule.f90 +++ b/src/nf/nf_locally_connected1d_layer_submodule.f90 @@ -128,12 +128,28 @@ module function get_params(self) result(params) params = [self % kernel, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr(1:size(self % biases)) => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(locally_connected1d_layer), intent(in), target :: self real, allocatable :: gradients(:) gradients = [self % dw, self % db] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr(1:size(self % db)) => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(locally_connected1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 3508ec50..60c0e151 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -701,28 +701,27 @@ module subroutine update(self, optimizer, batch_size) call this_layer % get_gradients_ptr(dw, db) call self % optimizer % minimize(weights, dw / batch_size_) call self % optimizer % minimize(biases, db / batch_size_) - type is(locally_connected1d_layer) - !TODO - type is(conv1d_layer) - !TODO - type is(conv2d_layer) - !TODO - end select - end do - - ! Flush network gradients to zero. 
- do n = 2, size(self % layers) - select type(this_layer => self % layers(n) % p) - type is(dense_layer) this_layer % dw = 0 this_layer % db = 0 - type is(conv2d_layer) + type is(conv1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 - type is(conv1d_layer) + type is(conv2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select From 0e11f1016828f229dbb5d1f50d7c573ff9a9c918 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 20 Jun 2025 13:59:22 -0400 Subject: [PATCH 6/6] Define optimizer instance per layer to preserve memory across layers --- src/nf/nf_layer.f90 | 1 + src/nf/nf_network_submodule.f90 | 46 +++++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 517622b0..b12592f3 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -22,6 +22,7 @@ module nf_layer integer, allocatable :: layer_shape(:) integer, allocatable :: input_layer_shape(:) logical :: initialized = .false. + class(optimizer_base_type), allocatable :: optimizer contains diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 60c0e151..876070bc 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -597,12 +597,26 @@ module subroutine train(self, input_data, output_data, batch_size, & ! If not provided, we default to SGD with its default settings. if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if call self % optimizer % init(self % get_num_params()) + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + ! Passing the loss instance is optional. ! If not provided, we default to quadratic(). if (present(loss)) then @@ -662,10 +676,26 @@ module subroutine update(self, optimizer, batch_size) if (.not. 
allocated(self % optimizer)) then if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if + call self % optimizer % init(self % get_num_params()) + + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + end if if (present(batch_size)) then @@ -699,29 +729,29 @@ module subroutine update(self, optimizer, batch_size) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv2d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select
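
The central mechanism in this patch set is Fortran's bounds-remapping (rank-remapping) pointer assignment, which is what lets get_params_ptr and get_gradients_ptr expose a layer's 2-d weight and gradient arrays as flat 1-d pointers so the optimizer can update parameters in place, avoiding the copies made by get_params/set_params. Below is a minimal, self-contained sketch of that mechanism, separate from the patches above; the variable names are illustrative stand-ins, not the library's.

program pointer_remap_sketch
  ! Standalone sketch, not part of the patches above: demonstrates the
  ! bounds-remapping pointer assignment used by get_params_ptr and
  ! get_gradients_ptr to view a 2-d weight array as a flat 1-d pointer,
  ! so an optimizer can update the parameters in place.
  implicit none
  real, target :: weights(3, 2)         ! stand-in for dense_layer % weights
  real, target :: dw(3, 2)              ! stand-in for the weight gradients
  real, pointer :: w_ptr(:), dw_ptr(:)  ! flat views of the same storage
  real, parameter :: learning_rate = 0.01

  weights = 1.0
  dw = 0.5

  ! Bounds-remapping pointer assignment (Fortran 2008): the 1-d pointers
  ! span the same contiguous storage as the 2-d targets; no copy is made.
  w_ptr(1:size(weights)) => weights
  dw_ptr(1:size(dw)) => dw

  ! A plain gradient-descent step through the flat view updates the
  ! original 2-d array in place.
  w_ptr = w_ptr - learning_rate * dw_ptr

  print *, weights  ! every element is now 0.995
end program pointer_remap_sketch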
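
The last patch gives every layer its own optimizer instance so that stateful optimizers keep their per-parameter memory (momentum velocity, Adam moments, accumulated squared gradients) sized and aligned to the parameters they update. The sketch below shows the shape of that pattern with a simplified stand-in type, sgd_t; it is an illustration under those assumptions, not the library's actual sgd or layer types.

module per_layer_optimizer_sketch
  ! Simplified stand-in for an optimizer with per-parameter state.
  implicit none
  private
  public :: sgd_t

  type :: sgd_t
    real :: learning_rate = 0.01
    real :: momentum = 0.9
    real, allocatable :: velocity(:)
  contains
    procedure :: init => sgd_init
    procedure :: minimize => sgd_minimize
  end type sgd_t

contains

  subroutine sgd_init(self, num_params)
    ! The state array is sized to one layer's parameters; keeping one
    ! such instance per layer is what lets minimize be called with
    ! per-layer arrays without mixing state between layers.
    class(sgd_t), intent(inout) :: self
    integer, intent(in) :: num_params
    if (.not. allocated(self % velocity)) then
      allocate(self % velocity(num_params))
      self % velocity = 0
    end if
  end subroutine sgd_init

  pure subroutine sgd_minimize(self, param, gradient)
    ! Momentum update applied in place to the parameter array.
    class(sgd_t), intent(inout) :: self
    real, intent(inout) :: param(:)
    real, intent(in) :: gradient(:)
    self % velocity = self % momentum * self % velocity &
      - self % learning_rate * gradient
    param = param + self % velocity
  end subroutine sgd_minimize

end module per_layer_optimizer_sketch

program per_layer_demo
  ! Two layers, each with its own optimizer instance and its own
  ! momentum state, mirroring the per-layer pattern of the last patch.
  use per_layer_optimizer_sketch, only: sgd_t
  implicit none
  type(sgd_t) :: opt(2)
  real :: w1(6), w2(4), g1(6), g2(4)
  integer :: n

  w1 = 1.0; w2 = 1.0
  g1 = 0.1; g2 = 0.2

  call opt(1) % init(size(w1))
  call opt(2) % init(size(w2))

  do n = 1, 3
    call opt(1) % minimize(w1, g1)
    call opt(2) % minimize(w2, g2)
  end do

  print *, w1(1), w2(1)
end program per_layer_demo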