Adding ability to save model checkpoint and best model

saraheisenach · saraheisenach · commit ca8bc2faa2a7 · 2022-12-03T17:46:40.000-05:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,26 +6,11 @@ repos:
     -   id: end-of-file-fixer
     -   id: trailing-whitespace
 # isort
-#-   repo: https://github.com/asottile/seed-isort-config
-#    rev: v2.2.0
-#    hooks:
-#    -   id: seed-isort-config
 -   repo: https://github.com/pycqa/isort
     rev: 5.10.1
     hooks:
     -   id: isort
         args: ["--profile", "black"]
-#- repo: https://github.com/pycqa/isort
-#    rev: 5.8.0
-#    hooks:
-#      - id: isort
-#        name: isort (python)
-#      - id: isort
-#        name: isort (cython)
-#        types: [cython]
-#      - id: isort
-#        name: isort (pyi)
-#        types: [pyi]
 # flake8
 -   repo: https://github.com/pycqa/flake8
     rev: 5.0.4
diff --git a/matdeeplearn/trainers/base_trainer.py b/matdeeplearn/trainers/base_trainer.py
@@ -1,3 +1,4 @@
+import copy
 import csv
 import logging
 import os
@@ -60,6 +61,7 @@ def __init__(
         self.metrics = {}
         self.epoch_time = None
         self.best_val_metric = 1e10
+        self.best_model_state = None
 
         self.evaluator = Evaluator()
 
@@ -218,6 +220,44 @@ def validate(self):
     def predict(self):
         """Implemented by derived classes."""
 
+    def update_best_model(self, val_metrics):
+        """Updates the best val metric and model, saves the best model, and saves the best model predictions"""
+        self.best_val_metric = val_metrics[type(self.loss_fn).__name__]["metric"]
+        self.best_model_state = copy.deepcopy(self.model.state_dict())
+
+        self.save_model("best_checkpoint.pt", val_metrics, False)
+
+        logging.debug(
+            f"Saving prediction results for epoch {self.epoch} to: /results/{self.timestamp_id}/"
+        )
+        self.predict(self.train_loader, "train")
+        self.predict(self.val_loader, "val")
+        self.predict(self.test_loader, "test")
+
+    def save_model(self, checkpoint_file, val_metrics=None, training_state=True):
+        """Saves a checkpoint (model weights plus either the training state or val_metrics) and returns its path"""
+
+        if training_state:
+            state = {
+                "epoch": self.epoch,
+                "step": self.step,
+                "state_dict": self.model.state_dict(),
+                "optimizer": self.optimizer.state_dict(),
+                "scheduler": self.scheduler.scheduler.state_dict(),
+                "best_val_metric": self.best_val_metric,
+            }
+        else:
+            state = {"state_dict": self.model.state_dict(), "val_metrics": val_metrics}
+
+        checkpoint_dir = os.path.join(
+            self.run_dir, "results", self.timestamp_id, "checkpoint"
+        )
+        os.makedirs(checkpoint_dir, exist_ok=True)
+        filename = os.path.join(checkpoint_dir, checkpoint_file)
+
+        torch.save(state, filename)
+        return filename
+
     def save_results(self, output, filename, node_level_predictions=False):
         results_path = os.path.join(self.run_dir, "results", self.timestamp_id)
         os.makedirs(results_path, exist_ok=True)
@@ -237,3 +277,9 @@ def save_results(self, output, filename, node_level_predictions=False):
                     csvwriter.writerow(headers)
                 elif i > 0:
                     csvwriter.writerow(output[i - 1, :])
+        return filename
+
+    def load_checkpoint(self):
+        """Placeholder for loading the model from a checkpoint.pt file (not implemented yet; currently a no-op)"""
+        # TODO: implement this method
+        pass
diff --git a/matdeeplearn/trainers/property_trainer.py b/matdeeplearn/trainers/property_trainer.py
@@ -81,8 +81,11 @@ def train(self):
                 _metrics = self._compute_metrics(out, batch, _metrics)
                 self.metrics = self.evaluator.update("loss", loss.item(), _metrics)
 
+            # TODO: could add param to eval and save on increments instead of every time
+            # Save current model
+            self.save_model(checkpoint_file="checkpoint.pt", training_state=True)
+
             # Evaluate on validation set if it exists
-            # TODO: could add param to eval on increments instead of every time
             if self.val_loader:
                 val_metrics = self.validate()
 
@@ -92,25 +95,18 @@ def train(self):
                 if epoch % self.train_verbosity == 0:
                     self._log_metrics(val_metrics)
 
-                # update best_val_metric and save predicted outputs for train, test, val
-                # TODO save checkpoint if metric is best so far
+                # Update best val metric and model, and save best model and predicted outputs
                 if (
                     val_metrics[type(self.loss_fn).__name__]["metric"]
                     < self.best_val_metric
                 ):
-                    self.best_val_metric = val_metrics[type(self.loss_fn).__name__][
-                        "metric"
-                    ]
-                    logging.debug(
-                        f"Saving prediction results for epoch {epoch} to: /results/{self.timestamp_id}/"
-                    )
-                    self.predict(self.train_loader, "train")
-                    self.predict(self.val_loader, "val")
-                    self.predict(self.test_loader, "test")
+                    self.update_best_model(val_metrics)
 
                 # step scheduler, using validation error
                 self._scheduler_step()
 
+        return self.best_model_state
+
     def validate(self, split="val"):
         self.model.eval()
         evaluator, metrics = Evaluator(), {}