Commit de3e6b9

Merge pull request #187 from InfiniTensor/issue/186
issue/186: support longrope
2 parents (c1a3ab2 + fc454c7), commit de3e6b9

4 files changed: 137 additions and 65 deletions

File tree

csrc/models/llama/llama_config.hpp
csrc/models/llama/llama_model.cpp
csrc/pybind11/models/llama.hpp
test/bench/test_benchmark.py

csrc/models/llama/llama_config.hpp

Lines changed: 25 additions & 21 deletions
@@ -7,6 +7,8 @@
 
 #include "../infinilm_model.hpp"
 
+#include <infinicore/nn/rope.hpp>
+
 namespace infinilm::models::llama {
 
 /**

[The -/+ pairs in the next hunk are whitespace-only re-alignment of trailing comments; the merged result is shown, with only the new lines marked +.]

@@ -20,41 +22,43 @@ struct LlamaConfig : public InfinilmModel::Config {
     infinicore::DataType dtype = infinicore::DataType::F32;
 
     // Vocabulary and embedding
     size_t vocab_size = 32000;        // Vocabulary size
     size_t hidden_size = 4096;        // Hidden dimension size
     size_t intermediate_size = 11008; // MLP intermediate dimension
 
     // Architecture
     size_t num_hidden_layers = 32;   // Number of decoder layers
     size_t num_attention_heads = 32; // Number of attention heads
     size_t num_key_value_heads = 32; // Number of key-value heads (for GQA)
     size_t head_dim = 128;           // Attention head dimension (hidden_size / num_attention_heads)
 
     // Position embeddings
     size_t max_position_embeddings = 2048; // Maximum sequence length
     double rope_theta = 10000.0;           // RoPE base frequency
+
+    std::shared_ptr<infinicore::nn::RoPE::ScalingConfig> rope_scaling = nullptr; // RoPE scaling type
 
     // Normalization
     double rms_norm_eps = 1e-6; // RMSNorm epsilon
 
     // Activation
     std::string hidden_act = "silu";  // Activation function (typically "silu")
     std::string model_type = "llama"; // Model type identifier (matches HF configs)
 
     // Optional features
     bool use_cache = true;              // Whether to use KV cache
     bool attention_bias = true;         // Whether to use bias in Q/K/V projections (default true for 9G7B compatibility)
     bool attention_output_bias = false; // Whether to use bias in output projection (o_proj)
     bool mlp_bias = false;              // Whether to use bias in MLP projections
     bool tie_word_embeddings = false;   // Whether to tie input/output embeddings
 
     // Training/initialization parameters
     double attention_dropout = 0.0;  // Dropout ratio for attention probabilities
     double initializer_range = 0.02; // Standard deviation for weight initialization
     size_t pretraining_tp = 1;       // Tensor parallelism rank used during pretraining
 
     // Model metadata
     std::string name_or_path = ""; // Model name or path identifier
 
     // Token IDs
     int64_t pad_token_id = -1; // Padding token ID (optional)
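
For orientation, the new rope_scaling field is meant to hold the HF-style rope_scaling entry from a model's config.json. A minimal sketch of such an entry (values are illustrative, not taken from any real checkpoint; the factor lists typically carry head_dim / 2 entries, one per rotary frequency):

# Hypothetical LongRoPE entry, Hugging Face config.json style.
rope_scaling = {
    "rope_type": "longrope",
    "factor": 4.0,                              # overall extension ratio
    "original_max_position_embeddings": 4096,   # pre-extension context window
    "short_factor": [1.0] * 64,                 # per-dimension scale, short sequences
    "long_factor": [4.0] * 64,                  # per-dimension scale, long sequences
}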

csrc/models/llama/llama_model.cpp

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ LlamaModel::LlamaModel(const LlamaConfig &config,
     // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing
     INFINICORE_NN_MODULE_INIT(rotary_emb, config.head_dim, config.max_position_embeddings,
                               config.rope_theta, infinicore::nn::RoPE::Algo::GPT_NEOX,
-                              dtype, device);
+                              dtype, device, config.rope_scaling);
 
     for (auto &layer : layers_) {
         if (layer) {
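
For intuition only: LongRoPE-style scaling rescales each RoPE inverse frequency by a per-dimension factor, switching between the short and long factor lists depending on whether the sequence exceeds the original context window. A rough sketch of that idea (this is not the infinicore implementation, which lives behind infinicore/nn/rope.hpp):

def longrope_inv_freq(head_dim, theta, seq_len,
                      original_max_position_embeddings,
                      short_factor, long_factor):
    # Pick the long factors once the context exceeds the original window,
    # then divide each base frequency by its per-dimension factor.
    factors = (long_factor if seq_len > original_max_position_embeddings
               else short_factor)
    return [1.0 / (factors[i] * theta ** (2 * i / head_dim))
            for i in range(head_dim // 2)]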

csrc/pybind11/models/llama.hpp

Lines changed: 86 additions & 2 deletions
@@ -6,6 +6,7 @@
 #include "../../models/llama/llama_attention.hpp"
 #include "infinicore/device.hpp"
 #include "infinicore/nn/module.hpp"
+#include "infinicore/nn/rope.hpp"
 #include "infinicore/tensor.hpp"
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
@@ -69,7 +70,8 @@ inline void bind_llama(py::module &m) {
         .def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp)
         .def_readwrite("name_or_path", &LlamaConfig::name_or_path)
         .def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
-        .def_property("bos_token_id", [](const LlamaConfig &self) {
+        .def_property(
+            "bos_token_id", [](const LlamaConfig &self) {
             // Always return as list to match Python config format
             return py::cast(self.bos_token_id); }, [](LlamaConfig &self, py::object value) {
             // Accept both single int and list
@@ -80,7 +82,8 @@ inline void bind_llama(py::module &m) {
             } else {
                 throw py::type_error("bos_token_id must be int or list of ints");
             } })
-        .def_property("eos_token_id", [](const LlamaConfig &self) {
+        .def_property(
+            "eos_token_id", [](const LlamaConfig &self) {
             // Always return as list to match Python config format
             return py::cast(self.eos_token_id); }, [](LlamaConfig &self, py::object value) {
             // Accept both single int and list
@@ -91,6 +94,86 @@ inline void bind_llama(py::module &m) {
             } else {
                 throw py::type_error("eos_token_id must be int or list of ints");
             } })
+        .def_property(
+            "rope_scaling",
+
+            // ---------- getter ----------
+            [](const LlamaConfig &self) -> py::object {
+                if (!self.rope_scaling) {
+                    return py::none();
+                }
+
+                using ScalingConfig = infinicore::nn::RoPE::ScalingConfig;
+                using LongRopeConfig = infinicore::nn::RoPE::LongRopeConfig;
+
+                py::dict d;
+
+                if (auto *lr = dynamic_cast<const LongRopeConfig *>(self.rope_scaling.get())) {
+                    d["type"] = "longrope";
+                    d["rope_type"] = "longrope";
+                    d["factor"] = lr->factor();
+                    d["original_max_position_embeddings"] = lr->original_max_position_embeddings();
+                    d["short_factor"] = lr->short_factor();
+                    d["long_factor"] = lr->long_factor();
+                } else {
+                    throw std::runtime_error("Unknown RoPE scaling type");
+                }
+
+                return std::move(d);
+            },
+
+            // ---------- setter ----------
+            [](LlamaConfig &self, py::object value) {
+                if (value.is_none()) {
+                    self.rope_scaling.reset();
+                    return;
+                }
+
+                if (!py::isinstance<py::dict>(value)) {
+                    throw py::type_error("rope_scaling must be a dict or None");
+                }
+
+                py::dict d = value.cast<py::dict>();
+
+                auto get_str = [&](const char *k) {
+                    if (!d.contains(k)) {
+                        throw py::key_error(k);
+                    }
+                    return py::cast<std::string>(d[k]);
+                };
+
+                std::string type = d.contains("rope_type")
+                                       ? py::cast<std::string>(d["rope_type"])
+                                       : get_str("type");
+
+                if (type == "longrope") {
+                    using LongRopeConfig = infinicore::nn::RoPE::LongRopeConfig;
+
+                    if (!d.contains("short_factor") || !d.contains("long_factor") || !d.contains("original_max_position_embeddings")) {
+                        throw py::value_error(
+                            "longrope requires short_factor, long_factor, "
+                            "original_max_position_embeddings");
+                    }
+
+                    std::vector<float> short_factor = py::cast<std::vector<float>>(d["short_factor"]);
+                    std::vector<float> long_factor = py::cast<std::vector<float>>(d["long_factor"]);
+
+                    size_t original_max_position_embeddings = py::cast<size_t>(d["original_max_position_embeddings"]);
+
+                    float factor = 1.0f;
+                    if (d.contains("factor")) {
+                        factor = py::cast<float>(d["factor"]);
+                    }
+
+                    self.rope_scaling = std::make_shared<LongRopeConfig>(
+                        std::move(short_factor),
+                        std::move(long_factor),
+                        original_max_position_embeddings,
+                        factor);
+                } else {
+                    throw py::value_error("Unsupported rope_scaling type: " + type);
+                }
+            })
         .def("validate", &LlamaConfig::validate)
         .def("kv_dim", &LlamaConfig::kv_dim)
         // Add __dir__ to make attributes discoverable via dir() in Python
@@ -108,6 +191,7 @@ inline void bind_llama(py::module &m) {
         dir_list.append("hidden_act");
         dir_list.append("model_type");
         dir_list.append("rope_theta");
+        dir_list.append("rope_scaling");
         dir_list.append("attention_bias");
         dir_list.append("attention_output_bias");
         dir_list.append("mlp_bias");

test/bench/test_benchmark.py

Lines changed: 25 additions & 41 deletions
@@ -368,7 +368,7 @@ def render_ceval(_tokenizer, conversation):
 def render_mmlu(_tokenizer, question, choices):
     """Render MMLU question and choices to input content"""
     choices_text = "\n".join(
-        [f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)]
+        [f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)]
     )
     instruction = (
         "You are a multiple-choice question solver. "
@@ -924,7 +924,9 @@ def _load_mmlu_subject(subj):
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"] if split == "val" else ["validation", "test"]
+            else ["validation"]
+            if split == "val"
+            else ["validation", "test"]
         )
         # Load each subject individually from hardcoded list, excluding "all"
         for subject_name in mmlu_subjects:
@@ -946,7 +948,9 @@ def _load_mmlu_subject(subj):
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"] if split == "val" else ["validation", "test"]
+            else ["validation"]
+            if split == "val"
+            else ["validation", "test"]
         )
         records = []
         for sp in splits_to_load:
@@ -980,14 +984,13 @@ def load_subject_samples(subj_name):
     all_results = []
 
     for subj in subject_list:
-        print(f"\n{'='*60}")
+        print(f"\n{'=' * 60}")
         print(f"Evaluating subject: {subj}")
-        print(f"{'='*60}\n")
+        print(f"{'=' * 60}\n")
 
         try:
             samples, actual_subj_name = load_subject_samples(subj)
             print(f"Loaded {len(samples)} samples for subject: {actual_subj_name}")
-
             # Limit number of samples if specified
             if num_samples is not None and num_samples > 0:
                 original_count = len(samples)
@@ -996,37 +999,9 @@ def load_subject_samples(subj_name):
                     f"Limited to {len(samples)} samples for validation (from {original_count} total)"
                 )
 
-            # Test with first sample if available
-            if len(samples) > 0:
-                sample = samples[0]
-                if benchmark == "ceval":
-                    input_content = f"'question':{sample['question']},'A': {sample['A']}, 'B':{sample['B']}, 'C': {sample['C']},'D': {sample['D']}。"
-                    test_conversation = [
-                        {
-                            "role": "system",
-                            "content": "请从question的A,B,C,D四个选项中选择正确的选项。例如,标准答案:A。",
-                        },
-                        {"role": "user", "content": input_content},
-                    ]
-                    test_output = model.generate(
-                        test_conversation,
-                        max_steps=max_new_tokens,
-                        topp_=1.0,
-                        topk_=1,
-                        temperature_=1.0,
-                    )
-                elif benchmark == "mmlu":
-                    question = sample["question"]
-                    choices = sample["choices"]
-                    test_output = model.generate(
-                        question,
-                        choices,
-                        max_steps=max_new_tokens,
-                        topp_=1.0,
-                        topk_=1,
-                        temperature_=1.0,
-                    )
-                print(f"\nTest output: {test_output}\n")
+            if len(samples) == 0:
+                print(f"No samples found for subject: {actual_subj_name}")
+                continue
 
             # Evaluate samples for this subject
             result = evaluate_samples(
@@ -1044,13 +1019,22 @@ def load_subject_samples(subj_name):
     model.destroy_model_instance()
 
     # Calculate overall results
+    print(f"\n{'=' * 60}")
+    print("OVERALL RESULTS")
+    print(f"{'=' * 60}")
+    if len(all_results) == 0:
+        print("No tests were run.")
+        return
+    elif len(all_results) > 1:
+        for r in all_results:
+            print(
+                f"Subject '{r['subject']}': {r['correct']}/{r['total']} = {r['accuracy']:.2%}"
+            )
     overall_correct = sum(r["correct"] for r in all_results)
     overall_total = sum(r["total"] for r in all_results)
     overall_accuracy = overall_correct / overall_total if overall_total > 0 else 0.0
 
-    print(f"\n{'='*60}")
-    print("OVERALL RESULTS")
-    print(f"{'='*60}")
+    print(f"{'=' * 60}")
     if benchmark == "ceval":
         print(
             f"Overall 成绩: {overall_correct}/{overall_total} = {overall_accuracy:.2%}"
@@ -1062,7 +1046,7 @@ def load_subject_samples(subj_name):
 
     print(f"Total Latency: {TOTAL_TIME} seconds")
     print(f"Total Tokens Processed: {TOTAL_TOKENS} tokens")
-    print(f"Overall Throughput: {TOTAL_TOKENS/TOTAL_TIME:.2f} tokens/s")
+    print(f"Overall Throughput: {TOTAL_TOKENS / TOTAL_TIME:.2f} tokens/s")
 
     # Write CSV if output path is specified
     if output_csv:
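
Aside on the splits_to_load hunks above: the reformatted chained conditional is just black's multi-line layout for the same expression; an equivalent lookup-table form (a sketch, not what the commit uses) would be:

def splits_for(split):
    # Same mapping as the chained conditional, written as a dict lookup.
    return {"test": ["test"], "val": ["validation"]}.get(
        split, ["validation", "test"]
    )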
