Skip to content

Commit b6e6549

Browse files
Bug fix for BitNet model quantization
1 parent d523873 commit b6e6549

4 files changed

Lines changed: 262 additions & 9 deletions

File tree

README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,14 @@ Inference on CPU for a 1.58-bit LLM decoding step. Click the image to view the o
3232
```bash
3333
git clone https://github.com/UIC-InDeXLab/RSR-Core.git
3434
cd RSR-Core
35-
pip install -e .
35+
pip install -e . --no-build-isolation
3636
```
3737

3838
#### Building the kernels
3939

40+
Both CPU and CUDA kernels are automatically built during `pip install -e . --no-build-isolation`.
41+
You can also build them manually:
42+
4043
**CPU kernels** — Compile the C shared libraries via the provided Makefiles.
4144
Requires `gcc` with AVX2 and OpenMP support.
4245

@@ -45,8 +48,9 @@ make -C kernels/bit_1/cpu
4548
make -C kernels/bit_1_58/cpu
4649
```
4750

48-
**CUDA kernels** — No manual build step needed. CUDA kernels are JIT-compiled
49-
by PyTorch on first use (`torch.utils.cpp_extension`). Requirements:
51+
**CUDA kernels** — Pre-built during install if a GPU is available. If not,
52+
they are JIT-compiled by PyTorch on first use (`torch.utils.cpp_extension`).
53+
Requirements:
5054
- CUDA toolkit (matching your PyTorch build)
5155
- `ninja` (`pip install ninja`)
5256

integrations/hf/model_infer.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,10 +582,37 @@ def load_hf_model(
582582

583583
# The @torch.compile-decorated unpack_weights in transformers' BitNet
584584
# integration fails on CPU with dynamo. Force eager execution.
585+
_prev_suppress = torch._dynamo.config.suppress_errors
585586
torch._dynamo.config.suppress_errors = True
586587

587588
model = AutoModelForCausalLM.from_pretrained(model_source, **load_kwargs)
588589

590+
torch._dynamo.config.suppress_errors = _prev_suppress
591+
592+
# Work around a bug in transformers' BitNetDeserialize.convert: it unpacks
593+
# ternary weights with dtype=uint8 (the storage dtype) instead of the
594+
# model's compute dtype, so -1 wraps to 255 and F.linear gets a dtype
595+
# mismatch. Only apply to BitNet models (detected via quantization_config).
596+
# Fix by reinterpreting uint8 as int8 then casting to the model's dtype.
597+
_is_bitnet = getattr(model.config, "quantization_config", None) is not None and (
598+
getattr(model.config.quantization_config, "quant_method", None) == "bitnet"
599+
or (isinstance(model.config.quantization_config, dict)
600+
and model.config.quantization_config.get("quant_method") == "bitnet")
601+
)
602+
if _is_bitnet:
603+
# Determine the correct target dtype: use the explicitly requested dtype,
604+
# otherwise infer from the non-quantized parameters already in the model.
605+
if dtype:
606+
_target_dtype = getattr(torch, dtype)
607+
else:
608+
_non_uint8 = [
609+
p.dtype for p in model.parameters() if p.dtype != torch.uint8
610+
]
611+
_target_dtype = _non_uint8[0] if _non_uint8 else torch.bfloat16
612+
for _name, param in model.named_parameters():
613+
if param.dtype == torch.uint8:
614+
param.data = param.data.view(torch.int8).to(_target_dtype)
615+
589616
# bitsandbytes models are already placed by device_map; skip .to()
590617
if quantize not in ("8bit", "4bit"):
591618
model = model.to(device)

setup.py

Lines changed: 114 additions & 6 deletions
Original file line numberDiff line numberDiff line change
"""Custom build: compile CPU and CUDA kernels during pip install."""

import subprocess
import sys
import os
from setuptools import setup
from setuptools.command.build_py import build_py
from setuptools.command.develop import develop

# Absolute path to the repository root (the directory containing setup.py);
# all kernel paths below are resolved relative to it.
ROOT = os.path.dirname(os.path.abspath(__file__))

# Makefile-based CPU kernel directories, built with `make -C <dir>`.
CPU_KERNEL_DIRS = [
    os.path.join(ROOT, "kernels", "bit_1", "cpu"),
    os.path.join(ROOT, "kernels", "bit_1_58", "cpu"),
]

# CUDA kernel source directories: bit_1 kernels are torch-JIT compiled,
# bit_1_58 additionally has a standalone nvcc-built shared library.
CUDA_KERNEL_DIR_BIT1 = os.path.join(ROOT, "kernels", "bit_1", "cuda")
CUDA_KERNEL_DIR_BIT158 = os.path.join(ROOT, "kernels", "bit_1_58", "cuda")
def _build_cpu_kernels():
    """Run `make` in every CPU kernel directory that ships a Makefile."""
    buildable = (
        kernel_dir
        for kernel_dir in CPU_KERNEL_DIRS
        if os.path.isdir(kernel_dir)
        and os.path.isfile(os.path.join(kernel_dir, "Makefile"))
    )
    for kernel_dir in buildable:
        subprocess.check_call(["make", "-C", kernel_dir])

2226

27+
def _print_cuda_skip_warning():
28+
"""Print a warning that CUDA kernels were not pre-built."""
29+
BOLD_RED = "\033[1;31m"
30+
RESET = "\033[0m"
31+
YELLOW = "\033[33m"
32+
print()
33+
print(f"{YELLOW}setup.py: CUDA not available — CUDA kernels were not pre-built.{RESET}")
34+
print(f"{YELLOW} They will be JIT-compiled on the first CUDA run, if available.{RESET}")
35+
print()
36+
print(f" {BOLD_RED}FOR BENCHMARKS PAY ATTENTION TO FIRST BUILD TIME{RESET}")
37+
print()
38+
39+
def _build_cuda_kernels():
    """JIT-compile all CUDA kernels so the first run has zero compilation delay.

    Best-effort by design: if torch is not importable or no CUDA device is
    visible, a warning is printed and the build is skipped (PyTorch will then
    JIT-compile the kernels on first use). Individual kernel build failures
    are reported but never abort the install.
    """
    try:
        import torch
        if not torch.cuda.is_available():
            _print_cuda_skip_warning()
            return
    except ImportError:
        _print_cuda_skip_warning()
        return

    from torch.utils.cpp_extension import load

    # Build only for the installed GPU's architecture to keep compiles fast.
    major, minor = torch.cuda.get_device_capability()
    os.environ["TORCH_CUDA_ARCH_LIST"] = f"{major}.{minor}"

    # Ensure ninja (installed into this interpreter's bin dir) is on PATH,
    # since torch's JIT extension builder shells out to it.
    bindir = os.path.dirname(sys.executable)
    path_entries = os.environ.get("PATH", "").split(os.pathsep)
    if bindir and bindir not in path_entries:
        os.environ["PATH"] = os.pathsep.join([bindir, *path_entries])

    def _jit_compile(kernel_dir, kernels):
        # Compile each (extension_name, source_file) pair found in kernel_dir;
        # missing sources are skipped, failures are logged and ignored.
        for name, source in kernels:
            source_path = os.path.join(kernel_dir, source)
            if not os.path.isfile(source_path):
                continue
            print(f"setup.py: JIT compiling {name} ...")
            try:
                load(
                    name=name,
                    sources=[source_path],
                    extra_cuda_cflags=["-O3", "--use_fast_math"],
                    verbose=False,
                )
            except Exception as e:
                print(f"setup.py: WARNING: failed to compile {name}: {e}")

    # -- bit_1 CUDA kernels (torch JIT) --
    _jit_compile(CUDA_KERNEL_DIR_BIT1, [
        ("rsr_cuda_v5_9", "rsr_v5_9.cu"),
        ("rsr_cuda_v5_8", "rsr_v5_8.cu"),
        ("rsr_cuda_v5_6", "rsr_v5_6.cu"),
        ("rsr_cuda_v4_10", "rsr_v4_10.cu"),
    ])

    # -- bit_1_58 CUDA kernels (torch JIT) --
    _jit_compile(CUDA_KERNEL_DIR_BIT158, [
        ("rsr_ternary_cuda_v2_0", "rsr_ternary_v2_0.cu"),
    ])

    # -- bit_1_58 BitNet kernel (compiled directly with nvcc to a .so) --
    bitnet_source = os.path.join(CUDA_KERNEL_DIR_BIT158, "bitnet_kernels.cu")
    bitnet_lib = os.path.join(CUDA_KERNEL_DIR_BIT158, "libbitnet.so")
    if os.path.isfile(bitnet_source) and not os.path.isfile(bitnet_lib):
        cuda_home = os.environ.get("CUDA_HOME", "/usr/local/cuda")
        nvcc = os.path.join(cuda_home, "bin", "nvcc")
        if os.path.isfile(nvcc):
            arch = f"{major}{minor}"
            cmd = [
                nvcc, "-std=c++17", "--shared", "--compiler-options", "-fPIC",
                "-O3", "--use_fast_math", "-lineinfo",
                f"-gencode=arch=compute_{arch},code=sm_{arch}",
                f"-gencode=arch=compute_{arch},code=compute_{arch}",
                bitnet_source, "-o", bitnet_lib,
            ]
            # fix: was an f-string with no placeholders (ruff F541)
            print("setup.py: compiling libbitnet.so ...")
            try:
                subprocess.run(cmd, cwd=CUDA_KERNEL_DIR_BIT158, check=True,
                               capture_output=True, text=True)
            except subprocess.CalledProcessError as e:
                # fix: with capture_output=True the exception message alone
                # hides nvcc's diagnostics — surface stderr explicitly.
                print(f"setup.py: WARNING: failed to compile libbitnet.so: {e}")
                if e.stderr:
                    print(e.stderr)
            except OSError as e:
                print(f"setup.py: WARNING: failed to compile libbitnet.so: {e}")
def _build_all_kernels():
    """Build CPU kernels, then pre-build CUDA kernels (best effort)."""
    for build_step in (_build_cpu_kernels, _build_cuda_kernels):
        build_step()
class BuildPyWithKernels(build_py):
    # Regular (non-editable) install hook: compile all kernels before the
    # standard build_py step copies Python sources into the build tree.
    def run(self):
        """Build CPU/CUDA kernels, then run the normal build_py command."""
        _build_all_kernels()
        super().run()
class DevelopWithKernels(develop):
    # Editable install hook (`pip install -e .`): compile all kernels before
    # the standard develop step links the package in place.
    def run(self):
        """Build CPU/CUDA kernels, then run the normal develop command."""
        _build_all_kernels()
        super().run()
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import { useEffect, useState } from "react";
2+
import { Link } from "react-router-dom";
3+
import { listModels, listMultipliers, getSystemInfo } from "../api";
4+
import Card from "../components/Card";
5+
6+
function StatCard({ label, value, sub, to, loading }) {
7+
const inner = (
8+
<div className="text-center">
9+
{loading ? (
10+
<div className="flex justify-center">
11+
<div className="h-8 w-16 rounded bg-gray-700 animate-pulse" />
12+
</div>
13+
) : (
14+
<p className="text-3xl font-bold text-cyan-400">{value}</p>
15+
)}
16+
<p className="text-sm text-gray-400 mt-1">{label}</p>
17+
{loading ? (
18+
<div className="flex justify-center mt-0.5">
19+
<div className="h-3 w-24 rounded bg-gray-700 animate-pulse" />
20+
</div>
21+
) : (
22+
sub && <p className="text-xs text-gray-600 mt-0.5">{sub}</p>
23+
)}
24+
</div>
25+
);
26+
if (to) return <Link to={to} className="block hover:scale-105 transition-transform">{inner}</Link>;
27+
return inner;
28+
}
29+
30+
export default function DashboardPage() {
31+
const [models, setModels] = useState([]);
32+
const [multipliers, setMultipliers] = useState([]);
33+
const [sys, setSys] = useState(null);
34+
const [loading, setLoading] = useState(true);
35+
36+
useEffect(() => {
37+
Promise.all([
38+
listModels().then(setModels).catch(() => {}),
39+
listMultipliers().then(setMultipliers).catch(() => {}),
40+
getSystemInfo().then(setSys).catch(() => {}),
41+
]).finally(() => setLoading(false));
42+
}, []);
43+
44+
const cpuModels = models.filter((m) => m.device === "cpu").length;
45+
const cudaModels = models.filter((m) => m.device === "cuda").length;
46+
const totalSize = models.reduce((s, m) => s + m.size_mb, 0);
47+
48+
return (
49+
<div className="max-w-5xl mx-auto space-y-6">
50+
<div>
51+
<h1 className="text-2xl font-bold text-white">Dashboard</h1>
52+
<p className="text-gray-500 text-sm mt-1">RSR-core project overview</p>
53+
</div>
54+
55+
<div className="grid grid-cols-2 md:grid-cols-4 gap-4">
56+
<Card><StatCard label="Preprocessed Models" value={models.length} sub={`${cpuModels} CPU / ${cudaModels} CUDA`} to="/models" loading={loading} /></Card>
57+
<Card><StatCard label="Multipliers" value={multipliers.length} to="/multipliers" loading={loading} /></Card>
58+
<Card><StatCard label="Total Size" value={`${(totalSize / 1024).toFixed(1)} GB`} sub="preprocessed data" loading={loading} /></Card>
59+
<Card><StatCard label="CUDA" value={sys?.cuda_available ? "Available" : "N/A"} sub={sys?.cuda_device || "CPU only"} loading={loading} /></Card>
60+
</div>
61+
62+
<div className="grid grid-cols-1 md:grid-cols-2 gap-4">
63+
<Card title="Quick Actions">
64+
<div className="space-y-2">
65+
<Link to="/preprocess" className="block w-full text-left px-4 py-3 rounded-lg bg-gray-800 hover:bg-gray-750 hover:bg-cyan-500/5 border border-gray-700 transition-colors">
66+
<span className="text-sm font-medium text-gray-200">Preprocess a Model</span>
67+
<span className="block text-xs text-gray-500 mt-0.5">Search HuggingFace and apply RSR preprocessing</span>
68+
</Link>
69+
<Link to="/inference" className="block w-full text-left px-4 py-3 rounded-lg bg-gray-800 hover:bg-cyan-500/5 border border-gray-700 transition-colors">
70+
<span className="text-sm font-medium text-gray-200">Run Inference</span>
71+
<span className="block text-xs text-gray-500 mt-0.5">Generate text with RSR-accelerated models</span>
72+
</Link>
73+
<Link to="/benchmarks" className="block w-full text-left px-4 py-3 rounded-lg bg-gray-800 hover:bg-cyan-500/5 border border-gray-700 transition-colors">
74+
<span className="text-sm font-medium text-gray-200">View Benchmarks</span>
75+
<span className="block text-xs text-gray-500 mt-0.5">Compare RSR performance vs baselines</span>
76+
</Link>
77+
</div>
78+
</Card>
79+
80+
<Card title="Preprocessed Models">
81+
{loading ? (
82+
<div className="space-y-2">
83+
{[...Array(3)].map((_, i) => (
84+
<div key={i} className="flex items-center justify-between px-3 py-2 bg-gray-800 rounded-lg">
85+
<div className="space-y-1.5">
86+
<div className="h-4 w-32 rounded bg-gray-700 animate-pulse" />
87+
<div className="h-3 w-20 rounded bg-gray-700 animate-pulse" />
88+
</div>
89+
<div className="h-5 w-10 rounded bg-gray-700 animate-pulse" />
90+
</div>
91+
))}
92+
</div>
93+
) : models.length === 0 ? (
94+
<p className="text-gray-500 text-sm">No preprocessed models yet. <Link to="/preprocess" className="text-cyan-400 hover:underline">Preprocess one</Link>.</p>
95+
) : (
96+
<div className="space-y-2 max-h-64 overflow-auto">
97+
{models.map((m) => (
98+
<div key={m.name} className="flex items-center justify-between px-3 py-2 bg-gray-800 rounded-lg">
99+
<div>
100+
<p className="text-sm text-gray-200">{m.name}</p>
101+
<p className="text-xs text-gray-500">{m.num_layers} layers, k={m.k}</p>
102+
</div>
103+
<span className={`text-xs px-2 py-0.5 rounded ${m.device === "cuda" ? "bg-green-500/10 text-green-400" : "bg-blue-500/10 text-blue-400"}`}>
104+
{m.device}
105+
</span>
106+
</div>
107+
))}
108+
</div>
109+
)}
110+
</Card>
111+
</div>
112+
</div>
113+
);
114+
}

0 commit comments

Comments
 (0)