【训练营】Checkpoint 读取工具 ([Training Camp] Checkpoint loading tool) #129
base: master
```diff
@@ -1,6 +1,9 @@
 #include <algorithm>
 #include <chrono>
 #include <cstdlib>
+#include <filesystem>
+#include <format>
+#include <limits>
 #include <memory>
 #include <optional>
 #include <unordered_map>
```
@@ -10,6 +13,7 @@ | |
| #include "glog/logging.h" | ||
|
|
||
| #include "infini_train/include/autocast.h" | ||
| #include "infini_train/include/checkpoint.h" | ||
| #include "infini_train/include/core/runtime/device_guard.h" | ||
| #include "infini_train/include/dataloader.h" | ||
| #include "infini_train/include/device.h" | ||
|
|
@@ -75,6 +79,14 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage."); | |
|
|
||
| // precision | ||
| DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)"); | ||
| DEFINE_uint32(save_steps, 0, "save checkpoint every N steps; 0 disables saving"); | ||
| DEFINE_string(resume_from, "", "checkpoint directory to resume from"); | ||
| DEFINE_string(checkpoint_dir, "./checkpoints", "root directory used to store checkpoints"); | ||
| DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep"); | ||
| DEFINE_bool(save_optimizer_state, true, "whether optimizer state is persisted in checkpoints"); | ||
| DEFINE_string(checkpoint_format, "bin", "checkpoint format: bin|pth"); | ||
| DEFINE_bool(use_llmc_checkpoint_io, false, | ||
| "whether to use GPT2 LLMC model.bin callback for checkpoint save/load when format=bin"); | ||
| // precision check | ||
| DEFINE_string( | ||
| precision_check, "", | ||
|
|
@@ -198,6 +210,8 @@ void Train(const nn::parallel::Rank &rank) { | |
| } else { | ||
| model = GPT2::FromPretrained(kStrToModelType.at(FLAGS_model)); | ||
| } | ||
| auto llmc_model = std::dynamic_pointer_cast<GPT2>(model); | ||
| CHECK(llmc_model != nullptr) << "Failed to cast model to GPT2 for LLMC checkpoint I/O."; | ||
|
|
||
| model->To(device); | ||
|
|
||
|
|
@@ -311,6 +325,7 @@ void Train(const nn::parallel::Rank &rank) { | |
| } | ||
|
|
||
| auto train_iter = train_loader.begin(); | ||
| size_t saved_data_batch_idx = train_iter.BatchIndex(); | ||
| std::shared_ptr<nn::Module> loss_fn | ||
| = (tp_world_size > 1) ? std::static_pointer_cast<nn::Module>( | ||
| std::make_shared<VocabParallelCrossEntropyLoss>(model_config.original_vocab_size)) | ||
|
|
@@ -320,9 +335,100 @@ void Train(const nn::parallel::Rank &rank) { | |
|
|
||
| auto impl = core::GetDeviceGuardImpl(device.type()); | ||
|
|
||
| int start_step = 0; | ||
| float best_loss = std::numeric_limits<float>::infinity(); | ||
| if (!FLAGS_resume_from.empty()) { | ||
| std::filesystem::path resume_dir = FLAGS_resume_from; | ||
| if (rank.IsParallel()) { | ||
| const auto rank_dir = resume_dir / std::format("rank_{:06d}", rank.GlobalRank()); | ||
| if (std::filesystem::exists(rank_dir)) { | ||
| resume_dir = rank_dir; | ||
| } | ||
| } | ||
|
|
||
| TrainerState state; | ||
| CheckpointLoadOptions load_options; | ||
| load_options.load_optimizer_state = true; | ||
| if (FLAGS_use_llmc_checkpoint_io) { | ||
| load_options.model_bin_loader = [](nn::Module *target_model, const std::filesystem::path &model_path) { | ||
| auto loaded_model = GPT2::FromLLMC(model_path.string()); | ||
| target_model->LoadStateDict(loaded_model->StateDict()); | ||
| }; | ||
| } | ||
| Checkpoint::Load(resume_dir, model.get(), optimizer.get(), &state, load_options); | ||
| start_step = static_cast<int>(state.global_step); | ||
| best_loss = state.best_loss; | ||
| if (state.data_batch_stride != static_cast<int64_t>(ddp_world_size) && rank.IsMainRank()) { | ||
| LOG(WARNING) << std::format("Checkpoint data_batch_stride {} mismatches current ddp_world_size {}. " | ||
| "Proceeding with recorded data_batch_idx {}.", | ||
| state.data_batch_stride, ddp_world_size, state.data_batch_idx); | ||
| } | ||
| saved_data_batch_idx = static_cast<size_t>(std::max<int64_t>(state.data_batch_idx, 0)); | ||
| train_iter = train_loader.IteratorAtBatchIndex(saved_data_batch_idx); | ||
| if (rank.IsMainRank()) { | ||
| LOG(INFO) << std::format( | ||
| "Resume training from step {} with best_loss {:.6f}, last_lr {:.3e}, data_batch_idx {}", | ||
| state.global_step, state.best_loss, state.last_lr, state.data_batch_idx); | ||
| LOG(INFO) << std::format("Checkpoint model I/O mode during resume: {}", | ||
| FLAGS_use_llmc_checkpoint_io ? "llmc-callback" : "native-state-dict"); | ||
| } | ||
| } | ||
|
|
||
| LOG(INFO) << "start training"; | ||
|
|
||
| for (int step = 0; step < FLAGS_num_iteration + 1; ++step) { | ||
| auto save_checkpoint = [&](const std::filesystem::path &save_dir, int64_t global_step, | ||
|
Contributor: Inside this expression you could also extract a function (something like SaveCheckpoint) into utils.cc, which internally just builds a single parameter struct and then calls SaveCheckpoint.

Author: Extracted the save_checkpoint logic into …
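For illustration, a minimal sketch of what the contributor's suggestion could look like. `SaveCheckpointParams`, the `SaveCheckpoint` signature, and the bare `Optimizer` type are hypothetical names not taken from this PR; the fields simply mirror the `TrainerState` values this diff fills in.

```cpp
// Hypothetical refactor per the review comment above; names are illustrative,
// not APIs confirmed by this PR. Would live in utils.h / utils.cc.
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <limits>

struct SaveCheckpointParams {
    std::filesystem::path save_dir;
    int64_t global_step = 0;
    bool prune_step_checkpoints = false;
    size_t data_batch_idx = 0;
    int64_t data_batch_stride = 1;
    float best_loss = std::numeric_limits<float>::infinity();
    float last_lr = 0.0f;
};

// Builds TrainerState/CheckpointOptions from params and forwards to
// Checkpoint::Save, replacing the body of the save_checkpoint lambda below.
void SaveCheckpoint(const nn::Module &model, const Optimizer &optimizer,
                    const SaveCheckpointParams &params);
```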
```diff
+                               bool prune_step_checkpoints) {
+        const auto ckpt_start = std::chrono::high_resolution_clock::now();
+
+        TrainerState state;
+        state.global_step = global_step;
+        state.data_batch_idx = saved_data_batch_idx;
+        state.data_batch_stride = ddp_world_size;
+        state.best_loss = best_loss;
+        state.last_lr = FLAGS_learning_rate;
+        state.optimizer_type = "SGD";
+        state.checkpoint_format = FLAGS_checkpoint_format;
+        state.ddp_size = ddp_world_size;
+        state.tp_size = tp_world_size;
+        state.sp_size = sp_world_size;
+        state.pp_size = pp_world_size;
+
+        CheckpointOptions options;
+        options.format = FLAGS_checkpoint_format;
+        options.save_optimizer_state = FLAGS_save_optimizer_state;
+        if (FLAGS_use_llmc_checkpoint_io) {
+            options.model_bin_writer = [&](const nn::Module &, const std::filesystem::path &model_path) {
+                llmc_model->SaveAsLLMC(model_path.string());
+            };
+        }
+        Checkpoint::Save(save_dir, *model, *optimizer, state, options);
+
+        const auto ckpt_end = std::chrono::high_resolution_clock::now();
+        const double ckpt_ms = std::chrono::duration<double, std::milli>(ckpt_end - ckpt_start).count();
+
+        if (rank.IsMainRank()) {
+            LOG(INFO) << std::format("Checkpoint saved at: {} ({:.2f} ms)", save_dir.string(), ckpt_ms);
+
+            if (prune_step_checkpoints) {
+                std::vector<std::filesystem::path> ckpts;
+                const auto root = std::filesystem::path(FLAGS_checkpoint_dir);
+                if (std::filesystem::exists(root)) {
+                    for (const auto &entry : std::filesystem::directory_iterator(root)) {
+                        if (entry.is_directory() && entry.path().filename().string().starts_with("checkpoint_step_")) {
+                            ckpts.push_back(entry.path());
+                        }
+                    }
+                    std::sort(ckpts.begin(), ckpts.end());
+                    while (ckpts.size() > FLAGS_max_checkpoint_keep) {
+                        std::filesystem::remove_all(ckpts.front());
+                        ckpts.erase(ckpts.begin());
+                    }
+                }
+            }
+        }
+    };
+
+    for (int step = start_step; step < FLAGS_num_iteration + 1; ++step) {
         // Reset precision check counters at start of each iteration for file overwrite
         utils::PrecisionChecker::ResetCounters();
```
@@ -372,6 +478,7 @@ void Train(const nn::parallel::Rank &rank) { | |
| // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below | ||
| // TODO(dcj): support dataloader.reset() later | ||
| ++train_iter; | ||
| saved_data_batch_idx = train_iter.BatchIndex(); | ||
| x = std::make_shared<Tensor>(x->To(device)); | ||
| y = std::make_shared<Tensor>(y->To(device)); | ||
|
|
||
|
|
@@ -401,6 +508,7 @@ void Train(const nn::parallel::Rank &rank) { | |
| // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below | ||
| // TODO(dcj): support dataloader.reset() later | ||
| ++train_iter; | ||
| saved_data_batch_idx = train_iter.BatchIndex(); | ||
| x = std::make_shared<Tensor>(x->To(device)); | ||
| y = std::make_shared<Tensor>(y->To(device)); | ||
|
|
||
|
|
@@ -413,6 +521,8 @@ void Train(const nn::parallel::Rank &rank) { | |
| lossf = static_cast<const float *>(lossf_tensor->To(Device()).DataPtr())[0]; | ||
| } | ||
|
|
||
| best_loss = std::min(best_loss, lossf); | ||
|
|
||
| const auto iter_end = std::chrono::high_resolution_clock::now(); | ||
| const double duration_us = std::chrono::duration<double, std::micro>(iter_end - iter_start).count(); | ||
| const double tps = FLAGS_total_batch_size / (duration_us / 1e6); | ||
|
|
@@ -435,8 +545,22 @@ void Train(const nn::parallel::Rank &rank) { | |
| } | ||
| } | ||
| } | ||
|
|
||
| if (FLAGS_save_steps > 0 && (step + 1) % FLAGS_save_steps == 0) { | ||
| std::filesystem::path step_dir | ||
| = std::filesystem::path(FLAGS_checkpoint_dir) / std::format("checkpoint_step_{:06d}", step + 1); | ||
| if (rank.IsParallel()) { | ||
| step_dir /= std::format("rank_{:06d}", rank.GlobalRank()); | ||
| } | ||
| save_checkpoint(step_dir, step + 1, true); | ||
| } | ||
| } | ||
|
|
||
| std::filesystem::path final_dir = std::filesystem::path(FLAGS_checkpoint_dir) / "checkpoint_final"; | ||
| if (rank.IsParallel()) { | ||
| final_dir /= std::format("rank_{:06d}", rank.GlobalRank()); | ||
| } | ||
| save_checkpoint(final_dir, FLAGS_num_iteration, false); | ||
| // Save LoRA weights if enabled and path specified | ||
| if (lora_enabled && !FLAGS_lora_save_path.empty()) { | ||
| LOG(INFO) << "Saving LoRA weights to: " << FLAGS_lora_save_path; | ||
|
|
||
Review comment: Suggest extracting the checkpoint resume, save, and stale-checkpoint cleanup steps from the main training flow into shared functions, so the main flow stays as lean as possible and the various training entry points can reuse them.
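As one concrete example of such a shared helper, the cleanup step could be lifted out roughly as follows. `PruneOldCheckpoints` is a hypothetical name; the body mirrors the pruning logic already present in the save_checkpoint lambda in this diff.

```cpp
#include <algorithm>
#include <cstddef>
#include <filesystem>
#include <vector>

// Hypothetical shared helper mirroring the pruning logic in this PR's
// save_checkpoint lambda: keep at most max_keep step checkpoints under root.
void PruneOldCheckpoints(const std::filesystem::path &root, size_t max_keep) {
    if (!std::filesystem::exists(root)) {
        return;
    }
    std::vector<std::filesystem::path> ckpts;
    for (const auto &entry : std::filesystem::directory_iterator(root)) {
        // Step directories are named checkpoint_step_NNNNNN (zero-padded), so
        // the lexicographic sort below also orders them by step number.
        if (entry.is_directory() && entry.path().filename().string().starts_with("checkpoint_step_")) {
            ckpts.push_back(entry.path());
        }
    }
    std::sort(ckpts.begin(), ckpts.end());
    while (ckpts.size() > max_keep) {
        std::filesystem::remove_all(ckpts.front());
        ckpts.erase(ckpts.begin());
    }
}
```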
Review comment: If a function ends up with too many parameters, they can be consolidated into a struct.
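Applied to the save path, the call site might then shrink to something like the following, reusing the hypothetical `SaveCheckpointParams` sketched earlier (designated initializers need C++20, which the existing std::format usage already implies):

```cpp
// Hypothetical call site; field names come from the SaveCheckpointParams
// sketch above, and fields left out keep their defaults.
SaveCheckpointParams params{
    .save_dir = step_dir,
    .global_step = step + 1,
    .prune_step_checkpoints = true,
    .data_batch_idx = saved_data_batch_idx,
    .best_loss = best_loss,
};
SaveCheckpoint(*model, *optimizer, params);
```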
Author: Extracted the resume process in main.cc into infini_train::ResumeFromCheckpoint(); start_step and related values are obtained via std::tie(), and the restored parameters are passed back by reference. Ran a simple test with llama3; the loss is reproducible.
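A hedged sketch of the interface the author describes; the exact signature is not shown in this thread, so the parameter list, the return tuple, and the `Optimizer` type are assumptions.

```cpp
#include <filesystem>
#include <limits>
#include <tuple>

namespace infini_train {
// Assumed shape: restores model/optimizer state in place through the
// references and returns (start_step, best_loss, data_batch_idx).
std::tuple<int, float, size_t> ResumeFromCheckpoint(const std::filesystem::path &resume_dir,
                                                    nn::Module &model, Optimizer &optimizer);
}  // namespace infini_train

// Call site matching the std::tie() usage described above:
int start_step = 0;
float best_loss = std::numeric_limits<float>::infinity();
size_t saved_data_batch_idx = 0;
std::tie(start_step, best_loss, saved_data_batch_idx)
    = infini_train::ResumeFromCheckpoint(FLAGS_resume_from, *model, *optimizer);
```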