
Commit d22a8f2

Merge pull request mala-project#648 from RandomDefaultUser/activation_list_overhaul
Overhaul activation list
2 parents: 679da78 + 76b1017 · commit d22a8f2

14 files changed: 134 additions & 69 deletions

examples/advanced/ex01_checkpoint_training.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def initial_setup():
     parameters.data.data_splitting_type = "by_snapshot"
     parameters.data.input_rescaling_type = "feature-wise-standard"
     parameters.data.output_rescaling_type = "minmax"
-    parameters.network.layer_activations = ["ReLU"]
+    parameters.network.layer_activations = "ReLU"
     parameters.running.max_number_epochs = 9
     parameters.running.mini_batch_size = 8
     parameters.running.learning_rate = 0.00001

examples/advanced/ex02_shuffle_data.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
 parameters.verbosity = 1
 parameters.data.input_rescaling_type = "feature-wise-standard"
 parameters.data.output_rescaling_type = "minmax"
-parameters.network.layer_activations = ["ReLU"]
+parameters.network.layer_activations = "ReLU"
 
 # No real training, just showing how shuffling directly before training works.
 parameters.running.max_number_epochs = 5

examples/advanced/ex03_tensor_board.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 parameters.targets.ldos_gridsize = 11
 parameters.targets.ldos_gridspacing_ev = 2.5
 parameters.targets.ldos_gridoffset_ev = -5
-parameters.network.layer_activations = ["ReLU"]
+parameters.network.layer_activations = "ReLU"
 parameters.running.max_number_epochs = 100
 parameters.running.mini_batch_size = 40
 parameters.running.learning_rate = 0.001

examples/basic/ex01_train_network.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 parameters.data.input_rescaling_type = "feature-wise-standard"
 parameters.data.output_rescaling_type = "minmax"
 # Specify the used activation function.
-parameters.network.layer_activations = ["ReLU"]
+parameters.network.layer_activations = "ReLU"
 # Specify the training parameters.
 # These may be determined via hyperparameter tuning.
 parameters.running.max_number_epochs = 100

mala/common/parameters.py

Lines changed: 29 additions & 11 deletions
@@ -344,16 +344,33 @@ class ParametersNetwork(ParametersBase):
         network. Please note that the input layer is included therein.
         Default: [10,10,0]
 
-    layer_activations : list
-        A list of strings detailing the activation functions to be used
-        by the neural network. If the dimension of layer_activations is
-        smaller than the dimension of layer_sizes-1, than the first entry
-        is used for all layers.
-        Currently supported activation functions are:
-
-        - Sigmoid (default)
-        - ReLU
-        - LeakyReLU
+    layer_activations : list or str or class or nn.Module
+        The activation functions to be used
+        by the neural network. If a single object is supplied, then this
+        activation function is used for all layers (whether this applies to the
+        output layer is controlled by layer_activations_include_output_layer).
+        Otherwise, the activation functions are added layer by layer.
+        Note that no activation function is applied between input layer and
+        first hidden layer!
+        The items in the list can either be strings (=names of torch.nn.Module
+        activation functions), which MALA will map to the correct activation
+        functions, torch.nn.Module objects, torch.nn.Module classes (which MALA
+        will instantiate) OR None, in which case no activation function is
+        used.
+        The None can be omitted at the end, but is useful when layers without
+        activation functions are to be skipped in the middle.
+        Note that output from the output layer is by default restricted to
+        only have positive values via restrict_targets in the ParametersTargets
+        subclass. This is similar to having a ReLU function as a final
+        activation function and ensures the physicality of the outputs (since
+        the (L)DOS can never be negative).
+
+    layer_activations_include_output_layer : bool
+        If False, no activation function is added to the output layer. This
+        can of course also be done by supplying just the right amount of
+        activation functions, and this parameter mainly exists to control the
+        last layer of activation functions in the case of using
+        layer_activations with only a single object.
 
     loss_function_type : string
         Loss function for the neural network
@@ -388,7 +405,8 @@ def __init__(self):
         super(ParametersNetwork, self).__init__()
         self.nn_type = "feed-forward"
         self.layer_sizes = [10, 10, 10]
-        self.layer_activations = ["Sigmoid"]
+        self.layer_activations = "LeakyReLU"
+        self.layer_activations_include_output_layer = True
         self.loss_function_type = "mse"
 
         # for LSTM/Gru

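The docstring above lists the forms layer_activations can now take. A minimal usage sketch of those options (the layer sizes and the GELU/Tanh choices are illustrative and not part of this commit):

import torch.nn as nn

import mala

parameters = mala.Parameters()
parameters.network.layer_sizes = [91, 100, 100, 11]

# Single object: reused for every layer; the flag controls whether the
# output layer also receives it.
parameters.network.layer_activations = "LeakyReLU"
parameters.network.layer_activations_include_output_layer = False

# Alternatively, a list applied layer by layer: strings (torch.nn names),
# nn.Module instances, nn.Module classes, or None (no activation there).
parameters.network.layer_activations = ["GELU", None, nn.Tanh()]
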
mala/network/network.py

Lines changed: 61 additions & 41 deletions
@@ -94,14 +94,6 @@ def __init__(self, params: Parameters):
         # initialize the parent class
         super(Network, self).__init__()
 
-        # Mappings for parsing of the activation layers.
-        self._activation_mappings = {
-            "Sigmoid": nn.Sigmoid,
-            "ReLU": nn.ReLU,
-            "LeakyReLU": nn.LeakyReLU,
-            "Tanh": nn.Tanh,
-        }
-
         # initialize the layers
         self.number_of_layers = len(self.params.layer_sizes) - 1
 
@@ -231,22 +223,27 @@ def __init__(self, params):
         # We should NOT modify the list itself. This would break the
         # hyperparameter algorithms.
         use_only_one_activation_type = False
-        if len(self.params.layer_activations) == 1:
-            use_only_one_activation_type = True
-        elif len(self.params.layer_activations) < self.number_of_layers:
-            raise Exception("Not enough activation layers provided.")
-        elif len(self.params.layer_activations) > self.number_of_layers:
-            printout(
-                "Too many activation layers provided. The last",
-                str(
+        if isinstance(self.params.layer_activations, list):
+            if len(self.params.layer_activations) > self.number_of_layers:
+
+                number_of_ignored_layers = (
                     len(self.params.layer_activations) - self.number_of_layers
-                ),
-                "activation function(s) will be ignored.",
-                min_verbosity=1,
-            )
+                )
+                number_of_ignored_layers += (
+                    1
+                    if self.params.layer_activations_include_output_layer
+                    is False
+                    else 0
+                )
+                printout(
+                    "Too many activation layers provided. The last",
+                    str(number_of_ignored_layers),
+                    "activation function(s) will be ignored.",
+                    min_verbosity=1,
+                )
 
         # Add the layers.
-        # As this is a feedforward layer we always add linear layers, and then
+        # As this is a feedforward NN we always add linear layers, and then
         # an activation function
         for i in range(0, self.number_of_layers):
             self.layers.append(
@@ -257,21 +254,24 @@ def __init__(self, params):
                     )
                 )
             )
-            try:
-                if use_only_one_activation_type:
-                    self.layers.append(
-                        self._activation_mappings[
-                            self.params.layer_activations[0]
-                        ]()
-                    )
-                else:
-                    self.layers.append(
-                        self._activation_mappings[
+            if (
+                i < self.number_of_layers - 1
+            ) or self.params.layer_activations_include_output_layer:
+                try:
+                    if isinstance(self.params.layer_activations, list):
+                        self._append_activation_function(
                             self.params.layer_activations[i]
-                        ]()
-                    )
-            except KeyError:
-                raise Exception("Invalid activation type seleceted.")
+                        )
+                    else:
+                        self._append_activation_function(
+                            self.params.layer_activations
+                        )
+
+                except KeyError:
+                    raise Exception("Invalid activation type selected.")
+                except IndexError:
+                    # No activation functions left to append at the end.
+                    pass
 
         # Once everything is done, we can move the Network on the target
         # device.
@@ -296,6 +296,30 @@ def forward(self, inputs):
             inputs = layer(inputs)
         return inputs
 
+    def _append_activation_function(self, activation_function):
+        """
+        Append an activation function to the network.
+
+        Parameters
+        ----------
+        activation_function : str or nn.Module or class
+            Activation function to be appended.
+        """
+        if activation_function is None:
+            pass
+        elif isinstance(activation_function, str):
+            try:
+                self.layers.append(getattr(torch.nn, activation_function)())
+            except AttributeError:
+                raise Exception(
+                    "Torch does not contain the specified "
+                    "activation function: " + activation_function
+                )
+        elif isinstance(activation_function, nn.Module):
+            self.layers.append(activation_function)
+        elif issubclass(activation_function, nn.Module):
+            self.layers.append(activation_function())
+
 
 class LSTM(Network):
     """Initialize this network as a LSTM network."""
@@ -339,9 +363,7 @@ def __init__(self, params):
             self.params.num_hidden_layers,
             batch_first=True,
         )
-        self.activation = self._activation_mappings[
-            self.params.layer_activations[0]
-        ]()
+        self.activation = getattr(torch.nn, self.params.layer_activations[0])()
 
         self.batch_size = None
         # Once everything is done, we can move the Network on the target
@@ -477,9 +499,7 @@ def __init__(self, params):
            self.params.num_hidden_layers,
            batch_first=True,
        )
-        self.activation = self._activation_mappings[
-            self.params.layer_activations[0]
-        ]()
+        self.activation = getattr(torch.nn, self.params.layer_activations[0])()
 
         if params.use_gpu:
             self.to("cuda")

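The string branch of the new _append_activation_function resolves activation names directly against torch.nn, replacing the removed _activation_mappings dictionary. A standalone sketch of that dispatch, outside MALA and purely illustrative:

import torch.nn as nn


def resolve_activation(activation_function):
    # Mirrors the dispatch in Network._append_activation_function.
    if activation_function is None:
        return None  # no activation for this layer
    if isinstance(activation_function, str):
        # Any torch.nn activation name ("ReLU", "GELU", "Mish", ...) resolves.
        return getattr(nn, activation_function)()
    if isinstance(activation_function, nn.Module):
        return activation_function  # already an instance
    if issubclass(activation_function, nn.Module):
        return activation_function()  # a class, so instantiate it


print(resolve_activation("GELU"))             # GELU()
print(resolve_activation(nn.LeakyReLU(0.1)))  # LeakyReLU(negative_slope=0.1)
print(resolve_activation(nn.Tanh))            # Tanh()

Because the lookup goes through getattr, the supported activation functions are no longer limited to the four entries of the old mapping.
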
mala/network/runner.py

Lines changed: 24 additions & 0 deletions
@@ -637,12 +637,24 @@ def load_run(
         load_with_mpi=None,
         load_with_gpu=None,
         load_with_ddp=None,
+        activation_list_legacy_load=False,
     ):
         """
         Load a run.
 
         Parameters
         ----------
+        activation_list_legacy_load : bool
+            Enables correct loading of old (<v1.3.1) MALA models, if these
+            models used only a single activation function for all layers.
+            In older MALA versions, the usage of the same activation function
+            across all layers was denoted via a list with only a single
+            element. Beginning with MALA v1.3.1, this is denoted by
+            layer_activations being a single string rather than a list.
+            If activation_list_legacy_load is set to True, and a list with
+            a single element is found in the stored model, then this list will
+            be transformed into a single string.
+
         run_name : str
             Name under which the run is saved.
 
@@ -739,6 +751,18 @@ def load_run(
             loaded_params, force_no_ddp=True
         )
 
+        # In older MALA versions, the usage of the same activation function
+        # across all layers was denoted via a list with only a single element.
+        # This was changed prior to the release of v1.3.1, and a list with
+        # a single element is now interpreted as exactly that. For backwards
+        # compatibility, the expected behavior can be recovered by extracting
+        # the one and only element of the list.
+        if activation_list_legacy_load:
+            if len(loaded_params.network.layer_activations) == 1:
+                loaded_params.network.layer_activations = (
+                    loaded_params.network.layer_activations[0]
+                )
+
         # MPI has to be specified upon loading, in contrast to GPU.
         if load_with_mpi is not None:
             loaded_params.use_mpi = load_with_mpi

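A hedged loading sketch for the backwards-compatibility flag documented above; "old_model" is a placeholder run name, mala.Tester stands in for any Runner subclass exposing load_run, and the unpacked return values follow the pattern used in MALA's own examples:

import mala

# A model saved with MALA < v1.3.1 may store a single activation function as
# a one-element list; the flag converts it to the new single-string form.
parameters, network, data_handler, tester = mala.Tester.load_run(
    "old_model", activation_list_legacy_load=True
)
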
test/all_lazy_loading_test.py

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@ def test_scaling(self):
         test_parameters.data.data_splitting_type = "by_snapshot"
         test_parameters.descriptors.bispectrum_twojmax = 11
         test_parameters.targets.ldos_gridsize = 10
-        test_parameters.network.layer_activations = ["LeakyReLU"]
+        test_parameters.network.layer_activations = "LeakyReLU"
         test_parameters.running.max_number_epochs = 3
         test_parameters.running.mini_batch_size = 512
         test_parameters.running.learning_rate = 0.00001
@@ -256,7 +256,7 @@ def _train_lazy_loading(prefetching):
         test_parameters.data.data_splitting_type = "by_snapshot"
         test_parameters.data.input_rescaling_type = "feature-wise-standard"
         test_parameters.data.output_rescaling_type = "minmax"
-        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.network.layer_activations = "ReLU"
         test_parameters.manual_seed = 1234
         test_parameters.running.max_number_epochs = 100
         test_parameters.running.mini_batch_size = 40

test/basic_gpu_test.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ def __run(use_gpu):
         test_parameters.data.output_rescaling_type = "minmax"
 
         # Specify the used activation function.
-        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.network.layer_activations = "ReLU"
 
         # Specify the training parameters.
         test_parameters.running.max_number_epochs = 100

test/checkpoint_training_test.py

Lines changed: 1 addition & 1 deletion
@@ -140,7 +140,7 @@ def __original_setup(
         test_parameters.data.output_rescaling_type = "minmax"
 
         # Specify the used activation function.
-        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.network.layer_activations = "ReLU"
 
         # Specify the training parameters.
         test_parameters.running.max_number_epochs = maxepochs
