
Commit 43b182a

Commit message: temp
1 parent: 52eca96

16 files changed
Lines changed: 325 additions & 798 deletions


example/gpt2/checkpoint_loader.cc

Lines changed: 18 additions & 18 deletions

@@ -17,7 +17,6 @@
#include "infini_train/include/nn/modules/normalization.h"
#include "infini_train/include/nn/modules/sparse.h"
#include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
-#include "infini_train/include/nn/modules/transformer/layer_specs.h"
#include "infini_train/include/nn/modules/transformer/mlp.h"
#include "infini_train/include/nn/modules/transformer/transformer.h"
#include "infini_train/include/nn/parallel/global.h"
@@ -96,10 +95,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
    gpt2_config.n_layer = n_layer;
    gpt2_config.n_head = n_head;
    gpt2_config.n_embd = n_embd;
-    auto local_gpt2 = std::make_shared<nn::TransformerModel>(
-        gpt2_config,
-        nn::BuildTransformerSpec(gpt2_config, nn::BuildFirstStageSpec(gpt2_config),
-                                 nn::BuildTransformerLayerSpec(gpt2_config), nn::BuildLastStageSpec(gpt2_config)));
+    auto local_gpt2 = std::make_shared<nn::TransformerModel>(gpt2_config);

    LOG(INFO) << "magic: " << magic << " version: " << version << " block_size: " << block_size
              << " vocab_size: " << vocab_size << " n_layer: " << n_layer << " n_head: " << n_head
@@ -140,6 +136,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)

    auto state_dict = local_gpt2->StateDict();

+    printf("===============Model Config:===============\n");
    // transformer.wte.weight (also transformer.lm_head.weight)
    // full: (model_vocab_size, n_embd)
    // local: (vocab_size_per_partition, n_embd)
@@ -158,7 +155,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
        size_t wte_bytes = model_vocab_size * n_embd * sizeof(float);
        ifs.seekg(wte_bytes, std::ios::cur);
    }
-
+    printf("Loading wte.weight...\n");
    if (tp_size == 1) {
        // Skip padded vocab part when TP is not enabled
        ifs.ignore((padded_vocab_size - model_vocab_size) * n_embd * sizeof(float));
@@ -174,7 +171,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
        size_t wpe_bytes = block_size * n_embd * sizeof(float);
        ifs.seekg(wpe_bytes, std::ios::cur);
    }
-
+    printf("Loading wpe.weight...\n");
    // transformer.h.{i}.ln_1.weight
    int local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -190,7 +187,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(ln_1_w_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading ln_1.weight...\n");
    // transformer.h.{i}.ln_1.bias
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -205,7 +202,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(ln_1_b_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading ln_1.bias...\n");
    // transformer.h.{i}.attn.c_attn.weight (ColumnParallelLinear, but actually applies on "rows")
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -248,7 +245,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(c_attn_w_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading c_attn.weight...\n");
    // transformer.h.{i}.attn.c_attn.bias (ColumnParallelLinear)
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -290,7 +287,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(c_attn_b_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading c_attn.bias...\n");
    // transformer.h.{i}.attn.c_proj.weight (RowParallelLinear, but actually applies on "columns")
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -307,7 +304,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(c_proj_w_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading c_proj.weight...\n");
    // transformer.h.{i}.attn.c_proj.bias (RowParallelLinear, no shard on bias)
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -323,7 +320,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(c_proj_b_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading ln_2.weight...\n");
    // transformer.h.{i}.ln_2.weight
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -339,7 +336,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(ln_2_w_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading ln_2.bias...\n");
    // transformer.h.{i}.ln_2.bias
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -354,7 +351,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(ln_2_b_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading mlp.c_fc.weight...\n");
    // transformer.h.{i}.mlp.c_fc.weight (ColumnParallelLinear, but actually applies on "rows")
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -370,7 +367,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(c_fc_w_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading mlp.c_fc.bias...\n");
    // transformer.h.{i}.mlp.c_fc.bias (ColumnParallelLinear)
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -386,7 +383,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(c_fc_b_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading mlp.c_proj.weight...\n");
    // transformer.h.{i}.mlp.c_proj.weight (RowParallelLinear, but actually applies on "columns")
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -403,7 +400,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
            ifs.seekg(c_proj_w_bytes, std::ios::cur);
        }
    }
-
+    printf("Loading mlp.c_proj.bias...\n");
    // transformer.h.{i}.mlp.c_proj.bias (RowParallelLinear, no shard on bias)
    local_layer_index = 0;
    for (int idx = 0; idx < n_layer; ++idx) {
@@ -420,6 +417,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
        }
    }

+    printf("Loading is_last_stage...\n");
    if (is_last_stage) {
        // transformer.ln_f.weight
        auto &transformer_ln_f_weight
@@ -436,6 +434,8 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
        size_t ln_f_b_bytes = n_embd * sizeof(float);
        ifs.seekg(ln_f_w_bytes + ln_f_b_bytes, std::ios::cur);
    }
+
+    printf("Finished loading checkpoint from %s\n", filepath.c_str());
    return local_gpt2;
}
} // namespace gpt2
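
Note on the sharding comments in this loader: ColumnParallelLinear layers (c_attn, mlp.c_fc) split the output dimension, which in the stored (out_features, in_features) row-major layout is a contiguous block of rows, while RowParallelLinear layers (c_proj) split the input dimension, i.e. columns, and their biases are not sharded at all. As a rough illustration of the contiguous row-shard case only, here is a hypothetical standalone helper (not code from this repository): a tensor-parallel rank reads just its slice and seeks past the rest so the stream stays aligned for the next tensor.

#include <cstdint>
#include <fstream>
#include <vector>

// Hypothetical sketch: read the row-slice of a (rows x cols) float32 tensor that
// belongs to tp_rank, assuming rows % tp_size == 0 and row-major storage.
std::vector<float> ReadRowShard(std::ifstream &ifs, int64_t rows, int64_t cols, int tp_rank, int tp_size) {
    const int64_t rows_per_rank = rows / tp_size;
    const int64_t row_bytes = cols * static_cast<int64_t>(sizeof(float));
    // Skip the rows owned by lower-numbered ranks.
    ifs.seekg(tp_rank * rows_per_rank * row_bytes, std::ios::cur);
    std::vector<float> shard(static_cast<size_t>(rows_per_rank * cols));
    ifs.read(reinterpret_cast<char *>(shard.data()), rows_per_rank * row_bytes);
    // Skip the rows owned by higher-numbered ranks so the next tensor starts at the right offset.
    ifs.seekg((rows - (tp_rank + 1) * rows_per_rank) * row_bytes, std::ios::cur);
    return shard;
}

A column shard (the RowParallelLinear weight case) is not contiguous in this layout and would need one partial read per row, which is presumably why the comments spell out which storage axis each parallel layer actually splits.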

example/gpt2/main.cc

Lines changed: 2 additions & 0 deletions

@@ -188,6 +188,7 @@ void Train(const nn::parallel::Rank &rank) {

    if (!FLAGS_llmc_filepath.empty()) {
        model = gpt2::LoadFromLLMC(FLAGS_llmc_filepath);
+        printf("Loaded model from LLMC checkpoint: %s\n", FLAGS_llmc_filepath.c_str());
    } else if (kModelToConfigs.count(FLAGS_model)) {
        model_config = kModelToConfigs.at(FLAGS_model);
        model = std::make_shared<nn::TransformerModel>(model_config);
@@ -370,6 +371,7 @@ void Train(const nn::parallel::Rank &rank) {
        y = std::make_shared<Tensor>(y->To(device));

        LOG(INFO) << "Rank " << rank.GlobalRank() << ": start forward";
+
        // (bs, seq_len, vocab_size)
        auto logits = (*model)({x, y})[0];
        LOG(INFO) << "Rank " << rank.GlobalRank() << ": finish model forward, start loss forward";

example/llama3/checkpoint_loader.cc

Lines changed: 1 addition & 5 deletions

@@ -16,7 +16,6 @@
#include "example/llama3/config.h"
#include "infini_train/include/nn/modules/normalization.h"
#include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
-#include "infini_train/include/nn/modules/transformer/layer_specs.h"
#include "infini_train/include/nn/modules/transformer/mlp.h"
#include "infini_train/include/nn/modules/transformer/transformer.h"
#include "infini_train/include/nn/parallel/global.h"
@@ -90,10 +89,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
    llama3_config.use_scaled_rope = static_cast<bool>(use_scaled_rope);
    llama3_config.norm_eps = norm_eps;
    llama3_config.max_gen_batch_size = max_gen_bs;
-    auto llama3 = std::make_shared<nn::TransformerModel>(
-        llama3_config,
-        nn::BuildTransformerSpec(llama3_config, nn::BuildFirstStageSpec(llama3_config),
-                                 nn::BuildTransformerLayerSpec(llama3_config), nn::BuildLastStageSpec(llama3_config)));
+    auto llama3 = std::make_shared<nn::TransformerModel>(llama3_config);

    // ========== pp_size:num_stages; vpp_size: num_chunks_per_stage ==========
    int pp_size = nn::parallel::global::GetPipelineParallelSize();
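
Both loaders now follow the same pattern: the BuildTransformerSpec / BuildFirstStageSpec / BuildTransformerLayerSpec / BuildLastStageSpec chain is removed along with the layer_specs.h include (the deleted headers appear further down), and TransformerModel is constructed from the config alone, presumably assembling its per-stage layers internally. A minimal before/after sketch of the call site, taken from the two diffs above ("config" stands for gpt2_config or llama3_config):

// Before: callers assembled the layer specs explicitly.
// auto model = std::make_shared<nn::TransformerModel>(
//     config, nn::BuildTransformerSpec(config, nn::BuildFirstStageSpec(config),
//                                      nn::BuildTransformerLayerSpec(config), nn::BuildLastStageSpec(config)));

// After: config-only construction.
auto model = std::make_shared<nn::TransformerModel>(config);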

infini_train/include/nn/modules/transformer/causal_self_attention.h

Lines changed: 1 addition & 2 deletions

@@ -5,7 +5,6 @@
#include <vector>

#include "infini_train/include/nn/modules/module.h"
-#include "infini_train/include/nn/modules/transformer/spec_utils.h"
#include "infini_train/include/nn/modules/transformer/transformer_config.h"

namespace infini_train::nn {
@@ -18,7 +17,7 @@ class CausalSelfAttention : public infini_train::nn::CloneableModule<CausalSelfAttention> {

    static constexpr char kParamBiasName[] = "bias";

-    explicit CausalSelfAttention(const TransformerConfig &config, const ModuleSpec &spec = {});
+    explicit CausalSelfAttention(const TransformerConfig &config);

    std::vector<std::shared_ptr<infini_train::Tensor>>
    Forward(const std::vector<std::shared_ptr<infini_train::Tensor>> &x) override;

infini_train/include/nn/modules/transformer/layer_specs.h

Lines changed: 0 additions & 55 deletions
This file was deleted.

infini_train/include/nn/modules/transformer/mlp.h

Lines changed: 4 additions & 2 deletions

@@ -3,7 +3,6 @@
#include <vector>

#include "infini_train/include/nn/modules/module.h"
-#include "infini_train/include/nn/modules/transformer/spec_utils.h"
#include "infini_train/include/nn/modules/transformer/transformer_config.h"

namespace infini_train::nn {
@@ -17,9 +16,12 @@ class MLP : public infini_train::nn::CloneableModule<MLP> {
    static constexpr char kCFc2LayerName[] = "c_fc2";
    static constexpr char kSiluLayerName[] = "silu";

-    explicit MLP(const TransformerConfig &config, const ModuleSpec &spec = {});
+    explicit MLP(const TransformerConfig &config);

    std::vector<std::shared_ptr<infini_train::Tensor>>
    Forward(const std::vector<std::shared_ptr<infini_train::Tensor>> &x) override;
+
+private:
+    int64_t hidden_dim_ = 0;
};
} // namespace infini_train::nn
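
The MLP constructor likewise drops the ModuleSpec parameter, and a private hidden_dim_ member appears, so the hidden width presumably now comes from the config rather than from the removed spec. Purely as an illustrative guess (DemoConfig and its ffn_hidden_dim field are hypothetical; the real derivation lives in mlp.cc and transformer_config.h), the resolution might look like:

#include <cstdint>

// Hypothetical stand-in for the relevant TransformerConfig fields.
struct DemoConfig {
    int64_t n_embd = 768;       // model width (field name taken from the GPT-2 config above)
    int64_t ffn_hidden_dim = 0; // assumed: 0 means "use the default 4x expansion"
};

// Sketch of how a hidden_dim_ member could be resolved from the config alone.
int64_t ResolveHiddenDim(const DemoConfig &config) {
    return config.ffn_hidden_dim > 0 ? config.ffn_hidden_dim : 4 * config.n_embd;
}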

infini_train/include/nn/modules/transformer/spec_utils.h

Lines changed: 0 additions & 93 deletions
This file was deleted.
