|
#include "example/common/utils.h"

#include <algorithm>
#include <cstdint>
#include <filesystem>
#include <format>
#include <limits>
#include <memory>
#include <tuple>

#include "gflags/gflags.h"
#include "gflags/gflags_declare.h"
#include "glog/logging.h"

#include "infini_train/include/nn/parallel/global.h"
3 | 8 | namespace infini_train { |
4 | 9 |
|
5 | 10 | float ConvertBF16ToFloat(void *ptr) { |
@@ -61,4 +66,53 @@ void ReadVectorShardFloat(std::ifstream &ifs, float *dst, int64_t len, int64_t s |
61 | 66 | ifs.seekg(base + std::streamoff(len * sizeof(float))); |
62 | 67 | } |
63 | 68 |
|
/// Restores training state from a checkpoint directory and repositions the
/// dataloader iterator accordingly.
///
/// Returns {global_step, best_loss, data_batch_idx} — all defaults (0, +inf, 0)
/// when `flag_resume_root` is empty, i.e. training starts from scratch.
std::tuple<int, float, size_t> ResumeFromCheckpoint(
    const fLS::clstring &flag_resume_root, // resume from this checkpoint directory
    const nn::parallel::Rank &rank,        // rank info for distributed training
    std::shared_ptr<nn::Module> model,     // model to be loaded with checkpoint state
    std::shared_ptr<Optimizer> optimizer,  // some optimizer may not have state, but others may have
    DistributedDataLoader &train_loader,   // distributed dataloader to be resumed
    TrainerState &state,                   // trainer state to be loaded from checkpoint
    DataLoaderIterator
        &train_iter, // dataloader iterator to be set to the correct position according to checkpoint state
    CheckpointLoadOptions model_bin_loader) {
    // Scratch-start defaults, overwritten below when a checkpoint is loaded.
    int resumed_step = 0;
    float resumed_best_loss = std::numeric_limits<float>::infinity();
    size_t resumed_batch_idx = 0;

    const int ddp_world_size = nn::parallel::global::GetDataParallelSize();

    if (flag_resume_root.empty()) {
        LOG(INFO) << "No checkpoint specified for resume. Starting training from scratch.";
        return {resumed_step, resumed_best_loss, resumed_batch_idx};
    }

    // Prefer the per-rank subdirectory ("rank_NNNNNN") when one exists;
    // otherwise fall back to the shared root directory.
    std::filesystem::path checkpoint_dir = flag_resume_root;
    if (rank.IsParallel()) {
        const auto per_rank_dir = checkpoint_dir / std::format("rank_{:06d}", rank.GlobalRank());
        if (std::filesystem::exists(per_rank_dir)) {
            checkpoint_dir = per_rank_dir;
        }
    }

    Checkpoint::Load(checkpoint_dir, model.get(), optimizer.get(), &state, model_bin_loader);

    resumed_step = static_cast<int>(state.global_step);
    resumed_best_loss = state.best_loss;
    // A stride mismatch means the data-parallel world size changed since the
    // checkpoint was written; keep the recorded batch index and warn once.
    if (state.data_batch_stride != static_cast<int64_t>(ddp_world_size) && rank.IsMainRank()) {
        LOG(WARNING) << std::format("Checkpoint data_batch_stride {} mismatches current ddp_world_size {}. "
                                    "Proceeding with recorded data_batch_idx {}.",
                                    state.data_batch_stride, ddp_world_size, state.data_batch_idx);
    }
    // Clamp a possibly-negative recorded index to zero before reuse.
    resumed_batch_idx = static_cast<size_t>(std::max<int64_t>(state.data_batch_idx, 0));
    train_iter = train_loader.IteratorAtBatchIndex(resumed_batch_idx);
    if (rank.IsMainRank()) {
        LOG(INFO) << std::format(
            "Resume training from step {} with best_loss {:.6f}, last_lr {:.3e}, data_batch_idx {}", state.global_step,
            state.best_loss, state.last_lr, state.data_batch_idx);
    }

    return {resumed_step, resumed_best_loss, resumed_batch_idx};
}
| 117 | + |
64 | 118 | } // namespace infini_train |
0 commit comments