
Commit d22a8f2

Merge pull request mala-project#648 from RandomDefaultUser/activation_list_overhaul
Overhaul activation list
2 parents: 679da78 + 76b1017 · commit d22a8f2

14 files changed: 134 additions & 69 deletions

examples/advanced/ex01_checkpoint_training.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def initial_setup():
     parameters.data.data_splitting_type = "by_snapshot"
     parameters.data.input_rescaling_type = "feature-wise-standard"
     parameters.data.output_rescaling_type = "minmax"
-    parameters.network.layer_activations = ["ReLU"]
+    parameters.network.layer_activations = "ReLU"
     parameters.running.max_number_epochs = 9
     parameters.running.mini_batch_size = 8
     parameters.running.learning_rate = 0.00001

examples/advanced/ex02_shuffle_data.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
 parameters.verbosity = 1
 parameters.data.input_rescaling_type = "feature-wise-standard"
 parameters.data.output_rescaling_type = "minmax"
-parameters.network.layer_activations = ["ReLU"]
+parameters.network.layer_activations = "ReLU"
 
 # No real training, just showing how shuffling directly before training works.
 parameters.running.max_number_epochs = 5

examples/advanced/ex03_tensor_board.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 parameters.targets.ldos_gridsize = 11
 parameters.targets.ldos_gridspacing_ev = 2.5
 parameters.targets.ldos_gridoffset_ev = -5
-parameters.network.layer_activations = ["ReLU"]
+parameters.network.layer_activations = "ReLU"
 parameters.running.max_number_epochs = 100
 parameters.running.mini_batch_size = 40
 parameters.running.learning_rate = 0.001

examples/basic/ex01_train_network.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 parameters.data.input_rescaling_type = "feature-wise-standard"
 parameters.data.output_rescaling_type = "minmax"
 # Specify the used activation function.
-parameters.network.layer_activations = ["ReLU"]
+parameters.network.layer_activations = "ReLU"
 # Specify the training parameters.
 # These may be determined via hyperparameter tuning.
 parameters.running.max_number_epochs = 100

mala/common/parameters.py

Lines changed: 29 additions & 11 deletions
@@ -344,16 +344,33 @@ class ParametersNetwork(ParametersBase):
         network. Please note that the input layer is included therein.
         Default: [10,10,0]
 
-    layer_activations : list
-        A list of strings detailing the activation functions to be used
-        by the neural network. If the dimension of layer_activations is
-        smaller than the dimension of layer_sizes-1, than the first entry
-        is used for all layers.
-        Currently supported activation functions are:
-
-        - Sigmoid (default)
-        - ReLU
-        - LeakyReLU
+    layer_activations : list or str or class or nn.Module
+        The activation functions to be used
+        by the neural network. If a single object is supplied, then this
+        activation function is used for all layers (whether this applies to the
+        output layer is controlled by layer_activations_include_output_layer).
+        Otherwise, the activation functions are added layer by layer.
+        Note that no activation function is applied between input layer and
+        first hidden layer!
+        The items in the list can either be strings (=names of torch.nn.Module
+        activation functions), which MALA will map to the correct activation
+        functions, torch.nn.Module objects, torch.nn.Module classes (which MALA
+        will instantiate) OR None, in which case no activation function is
+        used.
+        The None can be omitted at the end, but is useful when layers without
+        activation functions are to be skipped in the middle.
+        Note that output from the output layer is by default restricted to
+        only have positive values via restrict_targets in the ParametersTargets
+        subclass. This is similar to having a ReLU function as a final
+        activation function and ensures the physicality of the outputs (since
+        the (L)DOS can never be negative).
+
+    layer_activations_include_output_layer : bool
+        If False, no activation function is added to the output layer. This
+        can of course also be done by supplying just the right amount of
+        activation functions, and this parameter mainly exists to control the
+        last layer of activation functions in the case of using
+        layer_activations with only a single object.
 
     loss_function_type : string
         Loss function for the neural network
@@ -388,7 +405,8 @@ def __init__(self):
         super(ParametersNetwork, self).__init__()
         self.nn_type = "feed-forward"
         self.layer_sizes = [10, 10, 10]
-        self.layer_activations = ["Sigmoid"]
+        self.layer_activations = "LeakyReLU"
+        self.layer_activations_include_output_layer = True
         self.loss_function_type = "mse"
 
         # for LSTM/Gru

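The docstring above lists the forms layer_activations can now take. A minimal usage sketch of those options (the layer sizes and the GELU/Tanh choices are illustrative and not part of this commit):

import torch.nn as nn

import mala

parameters = mala.Parameters()
parameters.network.layer_sizes = [91, 100, 100, 11]

# Single object: reused for every layer; the flag controls whether the
# output layer also receives it.
parameters.network.layer_activations = "LeakyReLU"
parameters.network.layer_activations_include_output_layer = False

# Alternatively, a list applied layer by layer: strings (torch.nn names),
# nn.Module instances, nn.Module classes, or None (no activation there).
parameters.network.layer_activations = ["GELU", None, nn.Tanh()]
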
mala/network/network.py

Lines changed: 61 additions & 41 deletions
@@ -94,14 +94,6 @@ def __init__(self, params: Parameters):
         # initialize the parent class
         super(Network, self).__init__()
 
-        # Mappings for parsing of the activation layers.
-        self._activation_mappings = {
-            "Sigmoid": nn.Sigmoid,
-            "ReLU": nn.ReLU,
-            "LeakyReLU": nn.LeakyReLU,
-            "Tanh": nn.Tanh,
-        }
-
         # initialize the layers
         self.number_of_layers = len(self.params.layer_sizes) - 1
 
@@ -231,22 +223,27 @@ def __init__(self, params):
         # We should NOT modify the list itself. This would break the
         # hyperparameter algorithms.
         use_only_one_activation_type = False
-        if len(self.params.layer_activations) == 1:
-            use_only_one_activation_type = True
-        elif len(self.params.layer_activations) < self.number_of_layers:
-            raise Exception("Not enough activation layers provided.")
-        elif len(self.params.layer_activations) > self.number_of_layers:
-            printout(
-                "Too many activation layers provided. The last",
-                str(
+        if isinstance(self.params.layer_activations, list):
+            if len(self.params.layer_activations) > self.number_of_layers:
+
+                number_of_ignored_layers = (
                     len(self.params.layer_activations) - self.number_of_layers
-                ),
-                "activation function(s) will be ignored.",
-                min_verbosity=1,
-            )
+                )
+                number_of_ignored_layers += (
+                    1
+                    if self.params.layer_activations_include_output_layer
+                    is False
+                    else 0
+                )
+                printout(
+                    "Too many activation layers provided. The last",
+                    str(number_of_ignored_layers),
+                    "activation function(s) will be ignored.",
+                    min_verbosity=1,
+                )
 
         # Add the layers.
-        # As this is a feedforward layer we always add linear layers, and then
+        # As this is a feedforward NN we always add linear layers, and then
         # an activation function
         for i in range(0, self.number_of_layers):
             self.layers.append(
@@ -257,21 +254,24 @@ def __init__(self, params):
                     )
                 )
             )
-            try:
-                if use_only_one_activation_type:
-                    self.layers.append(
-                        self._activation_mappings[
-                            self.params.layer_activations[0]
-                        ]()
-                    )
-                else:
-                    self.layers.append(
-                        self._activation_mappings[
+            if (
+                i < self.number_of_layers - 1
+            ) or self.params.layer_activations_include_output_layer:
+                try:
+                    if isinstance(self.params.layer_activations, list):
+                        self._append_activation_function(
                             self.params.layer_activations[i]
-                        ]()
-                    )
-            except KeyError:
-                raise Exception("Invalid activation type seleceted.")
+                        )
+                    else:
+                        self._append_activation_function(
+                            self.params.layer_activations
+                        )
+
+                except KeyError:
+                    raise Exception("Invalid activation type selected.")
+                except IndexError:
+                    # No activation functions left to append at the end.
+                    pass
 
         # Once everything is done, we can move the Network on the target
         # device.
@@ -296,6 +296,30 @@ def forward(self, inputs):
             inputs = layer(inputs)
         return inputs
 
+    def _append_activation_function(self, activation_function):
+        """
+        Append an activation function to the network.
+
+        Parameters
+        ----------
+        activation_function : str or nn.Module or class
+            Activation function to be appended.
+        """
+        if activation_function is None:
+            pass
+        elif isinstance(activation_function, str):
+            try:
+                self.layers.append(getattr(torch.nn, activation_function)())
+            except AttributeError:
+                raise Exception(
+                    "Torch does not contain the specified "
+                    "activation function: " + activation_function
+                )
+        elif isinstance(activation_function, nn.Module):
+            self.layers.append(activation_function)
+        elif issubclass(activation_function, nn.Module):
+            self.layers.append(activation_function())
+
 
 class LSTM(Network):
     """Initialize this network as a LSTM network."""
@@ -339,9 +363,7 @@ def __init__(self, params):
             self.params.num_hidden_layers,
             batch_first=True,
         )
-        self.activation = self._activation_mappings[
-            self.params.layer_activations[0]
-        ]()
+        self.activation = getattr(torch.nn, self.params.layer_activations[0])()
 
         self.batch_size = None
         # Once everything is done, we can move the Network on the target
@@ -477,9 +499,7 @@ def __init__(self, params):
            self.params.num_hidden_layers,
            batch_first=True,
        )
-        self.activation = self._activation_mappings[
-            self.params.layer_activations[0]
-        ]()
+        self.activation = getattr(torch.nn, self.params.layer_activations[0])()
 
         if params.use_gpu:
             self.to("cuda")

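The string branch of the new _append_activation_function resolves activation names directly against torch.nn, replacing the removed _activation_mappings dictionary. A standalone sketch of that dispatch, outside MALA and purely illustrative:

import torch.nn as nn


def resolve_activation(activation_function):
    # Mirrors the dispatch in Network._append_activation_function.
    if activation_function is None:
        return None  # no activation for this layer
    if isinstance(activation_function, str):
        # Any torch.nn activation name ("ReLU", "GELU", "Mish", ...) resolves.
        return getattr(nn, activation_function)()
    if isinstance(activation_function, nn.Module):
        return activation_function  # already an instance
    if issubclass(activation_function, nn.Module):
        return activation_function()  # a class, so instantiate it


print(resolve_activation("GELU"))             # GELU()
print(resolve_activation(nn.LeakyReLU(0.1)))  # LeakyReLU(negative_slope=0.1)
print(resolve_activation(nn.Tanh))            # Tanh()

Because the lookup goes through getattr, the supported activation functions are no longer limited to the four entries of the old mapping.
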
mala/network/runner.py

Lines changed: 24 additions & 0 deletions
@@ -637,12 +637,24 @@ def load_run(
         load_with_mpi=None,
         load_with_gpu=None,
         load_with_ddp=None,
+        activation_list_legacy_load=False,
     ):
         """
         Load a run.
 
         Parameters
         ----------
+        activation_list_legacy_load : bool
+            Enables correct loading of old (<v1.3.1) MALA models, if these
+            models used only a single activation function for all layers.
+            In older MALA versions, the usage of the same activation function
+            across all layers was denoted via a list with only a single
+            element. Beginning with MALA v1.3.1, this is denoted by
+            layer_activations being a single string rather than a list.
+            If activation_list_legacy_load is set to True, and a list with
+            a single element is found in the stored model, then this list will
+            be transformed into a single string.
+
         run_name : str
             Name under which the run is saved.
 
@@ -739,6 +751,18 @@ def load_run(
             loaded_params, force_no_ddp=True
         )
 
+        # In older MALA versions, the usage of the same activation function
+        # across all layers was denoted via a list with only a single element.
+        # This was changed prior to the release of v1.3.1, and a list with
+        # a single element is now interpreted as exactly that. For backwards
+        # compatibility, the expected behavior can be recovered by extracting
+        # the one and only element of the list.
+        if activation_list_legacy_load:
+            if len(loaded_params.network.layer_activations) == 1:
+                loaded_params.network.layer_activations = (
+                    loaded_params.network.layer_activations[0]
+                )
+
         # MPI has to be specified upon loading, in contrast to GPU.
         if load_with_mpi is not None:
             loaded_params.use_mpi = load_with_mpi

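A hedged loading sketch for the backwards-compatibility flag documented above; "old_model" is a placeholder run name, mala.Tester stands in for any Runner subclass exposing load_run, and the unpacked return values follow the pattern used in MALA's own examples:

import mala

# A model saved with MALA < v1.3.1 may store a single activation function as
# a one-element list; the flag converts it to the new single-string form.
parameters, network, data_handler, tester = mala.Tester.load_run(
    "old_model", activation_list_legacy_load=True
)
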
test/all_lazy_loading_test.py

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@ def test_scaling(self):
         test_parameters.data.data_splitting_type = "by_snapshot"
         test_parameters.descriptors.bispectrum_twojmax = 11
         test_parameters.targets.ldos_gridsize = 10
-        test_parameters.network.layer_activations = ["LeakyReLU"]
+        test_parameters.network.layer_activations = "LeakyReLU"
         test_parameters.running.max_number_epochs = 3
         test_parameters.running.mini_batch_size = 512
         test_parameters.running.learning_rate = 0.00001
@@ -256,7 +256,7 @@ def _train_lazy_loading(prefetching):
         test_parameters.data.data_splitting_type = "by_snapshot"
         test_parameters.data.input_rescaling_type = "feature-wise-standard"
         test_parameters.data.output_rescaling_type = "minmax"
-        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.network.layer_activations = "ReLU"
         test_parameters.manual_seed = 1234
         test_parameters.running.max_number_epochs = 100
         test_parameters.running.mini_batch_size = 40

test/basic_gpu_test.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ def __run(use_gpu):
         test_parameters.data.output_rescaling_type = "minmax"
 
         # Specify the used activation function.
-        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.network.layer_activations = "ReLU"
 
         # Specify the training parameters.
         test_parameters.running.max_number_epochs = 100

test/checkpoint_training_test.py

Lines changed: 1 addition & 1 deletion
@@ -140,7 +140,7 @@ def __original_setup(
         test_parameters.data.output_rescaling_type = "minmax"
 
         # Specify the used activation function.
-        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.network.layer_activations = "ReLU"
 
         # Specify the training parameters.
         test_parameters.running.max_number_epochs = maxepochs
