feat: Introduce benchmarking plan, performance modernization design, and a register-based VM prototype.

ProgrammerKR · ProgrammerKR · commit b6f9309ef2b0 · 2025-12-14T15:12:16.000+05:30
diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md
@@ -0,0 +1,65 @@
+# ProXPL Benchmarking Plan
+
+## 1. Objectives
+- Measure baseline performance of the current interpreter.
+- Track improvements with the new Register VM and JIT.
+- Compare against CPython 3.11, Lua 5.4, and Node.js.
+
+## 2. Benchmark Suite Strategy
+
+### 2.1 Microbenchmarks (CPU Core)
+Focus on specific VM optimizations (dispatch, arithmetic, calls).
+
+| Benchmark | Description | Target Speedup (vs Current) |
+|-----------|-------------|----------------------------|
+| `fib.prox` | Recursive Fibonacci (Call overhead) | 3x |
+| `loop_sum.prox` | Tight loop addition (1M iters) | 5x |
+| `array_access.prox` | Read/Write array elements | 2x |
+| `dict_get.prox` | Dictionary lookups (String keys) | 2x |
+
+### 2.2 Macrobenchmarks (Real Workload)
+| Benchmark | Description | Target Speedup |
+|-----------|-------------|----------------|
+| `json_bench.prox` | Parse/Serialize simulated JSON | 3x |
+| `http_sim.prox` | Simulated request routing/handling | 2x |
+| `nbody.prox` | Physics simulation (Float math) | 10x (with JIT) |
+
+## 3. Tools & Methodology
+
+We will use `hyperfine` for statistical execution time measurement.
+
+### Pre-requisites
+- `hyperfine` (install via `cargo install hyperfine` or `apt-get install hyperfine`)
+- `python3` (CPython 3.11+)
+- `lua` (Lua 5.4)
+- `node` (Node.js 20+)
+
+### Execution Commands
+
+Run the following command from the repository root:
+
+```bash
+# Example: Running the Fibonacci Benchmark
+hyperfine --warmup 3 \
+  "bin/proxpl run benchmarks/micro/fib.prox" \
+  "python3 benchmarks/reference/fib.py" \
+  "lua benchmarks/reference/fib.lua" \
+  --export-markdown benchmarks/results/fib_results.md
+```
+
+## 4. Directory Structure
+
+```
+benchmarks/
+├── micro/
+│   ├── fib.prox
+│   ├── loop_sum.prox
+│   └── ...
+├── macro/
+│   └── nbody.prox
+├── reference/  <-- Equivalents in Py/Lua/Node
+│   ├── fib.py
+│   ├── fib.lua
+│   └── ...
+└── run_all.sh
+```
diff --git a/docs/design.md b/docs/design.md
@@ -0,0 +1,89 @@
+# ProXPL Performance Modernization Design Doc
+
+## 1. System Architecture (Target State)
+
+```text
+       +------------------+
+       |   Source Code    | (.prox)
+       +--------+---------+
+                |
+       +--------v---------+
+       |   Lexer/Parser   | (C / Modernized)
+       +--------+---------+
+                |
+       +--------v---------+
+       |    AST Builder   |
+       +--------+---------+
+                |
+       +--------v---------+
+       | Bytecode Compiler| (AST -> REG-ISA)
+       | (Reg Allocator)  |
+       +--------+---------+
+                |
+       +--------v---------+      +-------------------+
+       |   Bytecode Module|<-----|  Native Runtime   |
+       |  (Instructions)  |      | (C/Rust Strings,  |
+       +--------+---------+      |  Arrays, Dicts)   |
+                |                +-------------------+
+    +-----------v-----------+
+    |     Register VM       | <--- Profiling Events
+    | (Interpreter Loop)    |
+    +-----------+-----------+
+                |
+          [Hot Path?]
+                |
+    +-----------v-----------+
+    |   Baseline JIT (C)    | (Template/Copying JIT)
+    |  (Machine Code Gen)   |
+    +-----------+-----------+
+                |
+          [Very Hot?]
+                |
+    +-----------v-----------+     +------------------+
+    |   Optimizer (LLVM)    |     |  Inline Caches   |
+    | (Type Specialization) |<----+ (Polymorphic IC) |
+    +-----------------------+     +------------------+
+```
+
+## 2. Design Doc Outline & Modules
+
+### 2.1 Register-Based VM ISA
+**Motivation**: Reduce dispatch overhead (fewer instructions than stack VM) and improve cache locality.
+**Structure**:
+- `Instruction`: 32-bit word.
+- `Opcode`: 8 bits.
+- `A` (Dest): 8 bits.
+- `B` (Src1): 8 bits.
+- `C` (Src2/Imm): 8 bits.
+
+**Core Instructions**:
+- `MOV R_dest, R_src`
+- `ADD R_dest, R_src1, R_src2`
+- `LOADK R_dest, K_idx`
+- `CALL R_dest, R_func, NumArgs`
+- `RET R_src`
+
+### 2.2 Baseline JIT (Template JIT)
+**Strategy**:
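+- Pre-compile machine code snippets for each opcode (templates).
+- At JIT time, copy the template for each bytecode instruction into an executable buffer and patch in its operands.
+- **Benefit**: Quick to implement; expected 2-5x speedup over the interpreter.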
+
+### 2.3 Optimizing JIT (LLVM / DynASM)
+**Strategy**:
+- Triggered for hot loops (>10k executions).
+- **Type Specialization**: Guard checks for types.
+
+### 2.4 Data Model & Memory Layout
+- **Value**: NaN-boxing (64-bit).
+- **GC**: Generational Mark-and-Sweep.
+
+## 3. Risks & Tradeoffs
+1.  **Complexity**: LLVM is heavy. **Mitigation**: Start with Template JIT.
+2.  **GC Pauses**: Mark-and-sweep collections can stall the VM, and a generational GC adds implementation complexity. **Mitigation**: Incremental marking.
+
+## 4. Benchmark Plan
+
+**Microbenchmarks**:
+1.  `arith_loop.prox`: Tight loop summing integers (`loop_sum.prox` in `benchmarks/PLAN.md`).
+2.  `call_depth.prox`: Recursive Fibonacci (`fib.prox` in `benchmarks/PLAN.md`).
+3.  `str_cat.prox`: String concatenation.
diff --git a/src/protos/vm_register.c b/src/protos/vm_register.c
@@ -0,0 +1,139 @@
+/*
+  Prototype Register-Based VM for ProXPL
+  --------------------------------------
+  Structure: 32-bit instructions (Opcode:8, A:8, B:8, C:8)
+*/
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+// --- Instruction Format ---
+// | Opcode (8) | A (8) | B (8) | C (8) |
+// A: Destination Register
+// B: Source Register 1
+// C: Source Register 2 / Immediate
+
+// One encoded VM instruction: four 8-bit fields packed into 32 bits.
+typedef uint32_t Instruction;
+
+#define OP_MASK 0xFF
+#define REG_MASK 0xFF
+
+// Field extractors: opcode in bits 0-7, A in 8-15, B in 16-23, C in 24-31.
+#define GET_OP(i)   ((i) & OP_MASK)
+#define GET_A(i)    (((i) >> 8) & REG_MASK)
+#define GET_B(i)    (((i) >> 16) & REG_MASK)
+#define GET_C(i)    (((i) >> 24) & REG_MASK)
+
+// Packs four fields into one Instruction.
+// Every operand is converted to the unsigned Instruction type and masked to
+// 8 bits before it is shifted. Shifting a plain int operand >= 128 by 24
+// would move a 1 into the sign bit, which is undefined behavior; the mask
+// also keeps an oversized operand from spilling into a neighboring field.
+// The expansion is still an integer constant expression when the arguments
+// are, so it can be used in static initializers.
+#define MK_INS(op, a, b, c) \
+    (((Instruction)(op) & OP_MASK) | \
+     (((Instruction)(a) & REG_MASK) << 8) | \
+     (((Instruction)(b) & REG_MASK) << 16) | \
+     (((Instruction)(c) & REG_MASK) << 24))
+
+// --- Opcodes ---
+// Opcode numbering for the prototype VM. The numeric value is what ends up
+// in the low 8 bits of an encoded instruction.
+enum OpCode {
+    OP_HALT  = 0, // stop the interpreter loop
+    OP_LOADK = 1, // R[A] = Consts[B]
+    OP_MOV   = 2, // R[A] = R[B]
+    OP_ADD   = 3, // R[A] = R[B] + R[C]
+    OP_SUB   = 4, // presumably R[A] = R[B] - R[C], by analogy with OP_ADD
+    OP_PRINT = 5  // print R[A]
+};
+
+// --- VM State ---
+// Capacity limits. Both are 256, so every value an 8-bit operand field can
+// hold is a valid index into the register file and the constant table.
+enum {
+    MAX_REGS = 256,
+    MAX_CONSTS = 256
+};
+
+// Pool of numeric constants that OP_LOADK copies into registers.
+typedef struct {
+    double numbers[MAX_CONSTS];
+} ConstTable;
+
+// A unit of executable code together with the constants it refers to.
+typedef struct {
+    Instruction *code;   // first instruction of the program
+    size_t count;        // number of instructions stored in `code`
+    ConstTable *consts;  // constant pool indexed by OP_LOADK
+} ProtoChunk;
+
+// Interpreter state for one run.
+typedef struct {
+    double registers[MAX_REGS]; // Simplified Value type for prototype
+    Instruction *ip;            // next instruction to fetch
+} RegisterVM;
+
+// --- Interpreter Loop ---
+/*
+ * Runs `chunk` on `vm`, starting at the first instruction, until OP_HALT
+ * executes, an unrecognized opcode is fetched, or all chunk->count
+ * instructions have been consumed.
+ *
+ * vm    - interpreter state. vm->ip is reset to chunk->code; the registers
+ *         are not cleared, so whatever the caller left in them is visible
+ *         to the program.
+ * chunk - program to execute. chunk->count bounds the fetch loop, and
+ *         chunk->consts is dereferenced whenever OP_LOADK executes.
+ *
+ * Status messages and OP_PRINT output are written to stdout.
+ */
+void run_register_vm(RegisterVM* vm, ProtoChunk* chunk) {
+    // One past the last instruction. Fetching stops here, so a program that
+    // lacks a trailing OP_HALT cannot run off the end of the code array.
+    const Instruction* code_end = chunk->code + chunk->count;
+
+    vm->ip = chunk->code;
+
+    printf("Starting Register VM execution...\n");
+
+    while (vm->ip < code_end) {
+        Instruction ins = *vm->ip++;
+        uint8_t op = GET_OP(ins);
+
+        // Operand fields are 8 bits wide, so every register and constant
+        // index below is within the 256-entry tables.
+        // Computed goto would go here in production
+        switch (op) {
+            case OP_HALT:
+                printf("HALT encountered.\n");
+                return;
+
+            case OP_LOADK: {
+                uint8_t target = GET_A(ins);
+                uint8_t k_idx = GET_B(ins);
+                vm->registers[target] = chunk->consts->numbers[k_idx];
+                break;
+            }
+
+            case OP_MOV: {
+                uint8_t dest = GET_A(ins);
+                uint8_t src = GET_B(ins);
+                vm->registers[dest] = vm->registers[src];
+                break;
+            }
+
+            case OP_ADD: {
+                uint8_t dest = GET_A(ins);
+                uint8_t src1 = GET_B(ins);
+                uint8_t src2 = GET_C(ins);
+                // Type checking would happen here in full VM
+                vm->registers[dest] = vm->registers[src1] + vm->registers[src2];
+                break;
+            }
+
+            case OP_SUB: {
+                // Same operand layout as OP_ADD: R[A] = R[B] - R[C]
+                uint8_t dest = GET_A(ins);
+                uint8_t src1 = GET_B(ins);
+                uint8_t src2 = GET_C(ins);
+                vm->registers[dest] = vm->registers[src1] - vm->registers[src2];
+                break;
+            }
+
+            case OP_PRINT: {
+                uint8_t src = GET_A(ins);
+                printf("OUT: %f\n", vm->registers[src]);
+                break;
+            }
+
+            default:
+                printf("Unknown Opcode: %d\n", op);
+                return;
+        }
+    }
+
+    // Only reached when the loop consumed every instruction without a HALT.
+    printf("End of code reached without HALT.\n");
+}
+
+// --- Test Driver ---
+/*
+ * Test driver: assembles a five-instruction program that adds two constants
+ * and prints the sum, then executes it on a fresh VM.
+ *
+ * Returns 0 unconditionally.
+ */
+int main(void) {
+    // Program under test:
+    //   val1 = 10.5
+    //   val2 = 20.5
+    //   result = val1 + val2
+    //   print result
+    Instruction code[] = {
+        MK_INS(OP_LOADK, 0, 0, 0), // R0 = Const[0] (10.5)
+        MK_INS(OP_LOADK, 1, 1, 0), // R1 = Const[1] (20.5)
+        MK_INS(OP_ADD, 2, 0, 1),   // R2 = R0 + R1
+        MK_INS(OP_PRINT, 2, 0, 0), // PRINT R2
+        MK_INS(OP_HALT, 0, 0, 0)
+    };
+
+    // Slots that are not listed are zero-initialized rather than left
+    // indeterminate.
+    ConstTable constants = { .numbers = { 10.5, 20.5 } };
+
+    // Derive the instruction count from the array so it cannot drift out of
+    // sync when the program above is edited.
+    ProtoChunk chunk = {
+        .code = code,
+        .count = sizeof code / sizeof code[0],
+        .consts = &constants
+    };
+
+    // Naming one member zero-initializes the rest, so every register starts
+    // at 0.0 instead of holding an indeterminate value.
+    RegisterVM vm = { .ip = NULL };
+
+    run_register_vm(&vm, &chunk);
+
+    return 0;
+}