Skip to content

Commit 7a7bed3

Browse files
Add benchmark results for mat-mult
1 parent 61a3187 commit 7a7bed3

26 files changed

Lines changed: 1320 additions & 2079 deletions

.gitignore

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,14 @@ env/
6565

6666
.key
6767

68-
benchmarking/**/reports/
68+
benchmarking/**/reports/**
69+
!benchmarking/**/reports/
70+
!benchmarking/**/reports/**/
71+
!benchmarking/**/reports/**/*.json
6972

7073
out.log
7174

7275
integrations/**/*.json
7376
integrations/**/*.safetensors
7477

75-
.claude
78+
.claude

README.md

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,38 +8,56 @@ Reference: [UIC-InDeXLab/RSR](https://github.com/UIC-InDeXLab/RSR)
88

99
```
1010
RSR-core/
11-
├── poc/ # Python proof-of-concept implementation
12-
├── kernels/
13-
│ ├── cpu/ # CPU kernels (C/C++)
14-
│ └── cuda/ # CUDA GPU kernels
15-
├── tests/ # Unit and integration tests
16-
└── benchmarks/ # Performance benchmarks
11+
├── multiplier/ # Python wrappers for kernels
12+
│ ├── bit_1/ # 1-bit (binary) multipliers (CPU/CUDA)
13+
│ └── bit_1_58/ # 1.58-bit (ternary) multipliers (CPU/CUDA)
14+
├── kernels/ # Low-level C/CUDA kernel source
15+
│ ├── bit_1/
16+
│ │ ├── cpu/ # C kernels
17+
│ │ └── cuda/ # CUDA kernels (.cu)
18+
│ └── bit_1_58/
19+
│ ├── cpu/ # C kernels
20+
│ └── cuda/ # CUDA kernels (.cu)
21+
├── integrations/ # Model integrations
22+
│ └── hf/ # HuggingFace integration
23+
├── benchmarking/ # Benchmarking scripts & results
24+
└── tests/ # Unit and integration tests
1725
```
1826

1927
## Benchmark Results
2028

2129
### Matrix-Vector Multiplication
2230

23-
#### CPU:
31+
#### CPU 🖥️
32+
33+
| 1-bit | 1.58-bit |
34+
|:---:|:---:|
35+
| ![1-bit CPU](assets/cpu_bit_1.png) | ![1.58-bit CPU](assets/cpu_bit_1_58.png) |
36+
37+
#### CUDA ⚡
2438

25-
#### CUDA:
39+
| 1-bit | 1.58-bit |
40+
|:---:|:---:|
41+
| ![1-bit CUDA](assets/cuda_bit_1.png) | ![1.58-bit CUDA](assets/cuda_bit_1_58.png) |
2642

2743
### Ternary (1.58bit) LLMs
2844

29-
Speedup is computed from `Avg Time` against the `HF bfloat16` baseline for the same model.
45+
Speedup is computed against the HuggingFace `bfloat16` baseline for the same model.
3046

3147
#### CPU 🖥️
32-
| Model | HF Time | RSR (ours) Time | HF Tok/s | RSR (ours) Tok/s | Speedup vs HF |
33-
| --- | ---: | ---: | ---: | ---: | ---: |
34-
| Falcon3-10B-Instruct-1.58bit | 351.215s | **5.663s** | 0.2 | **11.3** | **62.0x** |
35-
| Llama3-8B-1.58-100B-tokens | 261.557s | **4.862s** | 0.2 | **13.4** | **53.8x** |
36-
| bitnet-b1.58-2B-4T-bf16 | 31.446s | **2.258s** | 2.1 | **28.8** | **13.9x** |
37-
| bitnet-b1.58-2B-4T | 4.582s | **2.221s** | 14.2 | **29.3** | **2.1x** |
48+
49+
| Model | HF Tok/s | RSR Tok/s | Speedup |
50+
| :--- | ---: | ---: | ---: |
51+
| Falcon3-10B-Instruct-1.58bit | 0.2 | **11.3** | **62.0x** |
52+
| Llama3-8B-1.58-100B-tokens | 0.2 | **13.4** | **53.8x** |
53+
| bitnet-b1.58-2B-4T-bf16 | 2.1 | **28.8** | **13.9x** |
54+
| bitnet-b1.58-2B-4T | 14.2 | **29.3** | **2.1x** |
3855

3956
#### CUDA ⚡
40-
| Model | HF Time | RSR (ours) Time | HF Tok/s | RSR (ours) Tok/s | Speedup vs HF |
41-
| --- | ---: | ---: | ---: | ---: | ---: |
42-
| Falcon3-10B-Instruct-1.58bit | 2.536s | **1.351s** | 25.2 | **47.4** | **1.9x** |
43-
| Llama3-8B-1.58-100B-tokens | 2.035s | **1.097s** | 31.9 | **59.3** | **1.9x** |
44-
| bitnet-b1.58-2B-4T-bf16 | 1.966s | **1.133s** | 33.1 | **57.4** | **1.7x** |
45-
| bitnet-b1.58-2B-4T | 1.563s | **1.139s** | 41.6 | **57.1** | **1.4x** |
57+
58+
| Model | HF Tok/s | RSR Tok/s | Speedup |
59+
| :--- | ---: | ---: | ---: |
60+
| Falcon3-10B-Instruct-1.58bit | 25.2 | **47.4** | **1.9x** |
61+
| Llama3-8B-1.58-100B-tokens | 31.9 | **59.3** | **1.9x** |
62+
| bitnet-b1.58-2B-4T-bf16 | 33.1 | **57.4** | **1.7x** |
63+
| bitnet-b1.58-2B-4T | 41.6 | **57.1** | **1.4x** |

assets/cpu_bit_1.png

71.8 KB
Loading

assets/cpu_bit_1_58.png

65.9 KB
Loading

assets/cuda_bit_1.png

61.5 KB
Loading

assets/cuda_bit_1_58.png

72 KB
Loading

assets/plot_shapes_cpu.png

65.9 KB
Loading

assets/plot_shapes_cuda.png

72 KB
Loading
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
"""
2+
Benchmark CPU binary (1-bit) multipliers on a given list of matrix shapes.
3+
4+
Edit SHAPES and K_VALUES below to configure the benchmark.
5+
Timing: median inference latency (preprocessing excluded).
6+
"""
7+
8+
import csv
9+
import importlib
10+
import inspect
11+
import os
12+
import sys
13+
import time
14+
from pathlib import Path
15+
16+
import numpy as np
17+
import torch
18+
19+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
20+
21+
# ---------------------------------------------------------------------------
22+
# Configure here
23+
# ---------------------------------------------------------------------------
24+
25+
# Matrix shapes to benchmark, as (rows, cols). main() builds one random binary
# matrix per entry and times every selected multiplier on it.
SHAPES = [
    (1024, 1024),
    (2048, 2048),
    (4096, 4096),
    (8192, 8192),
    (16384, 16384),
    (32768, 32768),
]

# Values of k handed to every multiplier whose constructor accepts a `k`
# parameter. main() skips a k that does not evenly divide the row count and
# records a blank cell for it.
K_VALUES = [2, 4, 6, 8, 10]

# Limit to these method labels; empty list = all discovered methods
METHODS = ["BitNet", "RSR", "pytorch"]

# REPEATS: number of timed calls per measurement (bench() reports their median).
# WARMUP: number of untimed calls made before timing starts.
REPEATS = 30
WARMUP = 10
41+
42+
# ---------------------------------------------------------------------------
43+
# Helpers
44+
# ---------------------------------------------------------------------------
45+
46+
47+
def random_binary_matrix(rows, cols):
    """Return a ``(rows, cols)`` float32 tensor whose entries are each 0.0 or 1.0."""
    shape = (rows, cols)
    # The upper bound of randint is exclusive, so [0, 2) produces only 0 and 1.
    return torch.randint(low=0, high=2, size=shape, dtype=torch.float32)
49+
50+
51+
def bench(multiplier, v, warmup=WARMUP, repeats=REPEATS):
    """Time ``multiplier(v)`` and return the median wall-clock duration in seconds.

    Args:
        multiplier: Callable invoked as ``multiplier(v)``; its result is discarded.
        v: Argument passed to every call.
        warmup: Number of untimed calls made before measuring.
        repeats: Number of timed calls whose durations feed the median.

    Returns:
        The median of the ``repeats`` measured durations, as computed by
        ``np.median``.
    """
    # Untimed warm-up calls, so the measured runs start from a warmed state.
    for _ in range(warmup):
        multiplier(v)

    def _timed_call():
        # Only the call itself sits between the two clock reads.
        start = time.perf_counter()
        multiplier(v)
        stop = time.perf_counter()
        return stop - start

    durations = [_timed_call() for _ in range(repeats)]
    return np.median(durations)
61+
62+
63+
def fmt(t):
    """Format a duration given in seconds as milliseconds with three decimals.

    Returns the string ``"N/A"`` when ``t`` is ``None`` or NaN.
    """
    # `None` is tested first so that np.isnan is never called on it.
    missing = t is None or np.isnan(t)
    return "N/A" if missing else f"{t * 1e3:.3f}ms"
67+
68+
69+
# ---------------------------------------------------------------------------
70+
# Version discovery
71+
# ---------------------------------------------------------------------------
72+
73+
# Column label for each known multiplier module, keyed by the module's file
# stem. Stems that are not listed here are labelled by the fallback rules in
# _stem_to_label.
_LABEL_MAP = {
    "bitnet": "BitNet",
    "tmac": "T-MAC",
    "rsr_cpp": "v1",
    "rsr_cpp_v2_4": "v2.4",
    "rsr_cpp_v4_2": "v4.2",
    "rsr_adaptive": "adaptive",
    "rsr_cpp_nonsquare": "RSR",
}

# File stems that discover_versions skips instead of importing.
_EXCLUDE = {"__init__", "base"}
84+
85+
86+
def _stem_to_label(stem):
    """Translate a multiplier module's file stem into its report label.

    Resolution order:
      1. an explicit entry in ``_LABEL_MAP``;
      2. ``rsr_cpp_v<a>_<b>`` becomes ``v<a>.<b>``;
      3. otherwise the stem is returned unchanged.
    """
    try:
        return _LABEL_MAP[stem]
    except KeyError:
        pass

    prefix = "rsr_cpp_v"
    if not stem.startswith(prefix):
        return stem
    # Underscores in the version suffix stand for dots.
    version = stem[len(prefix):].replace("_", ".")
    return "v" + version
92+
93+
94+
def discover_versions():
    """Collect the multiplier classes available for benchmarking.

    The PyTorch multiplier is always listed first. Every other candidate is
    found by importing each ``*.py`` module in ``multiplier/bit_1/cpu`` and
    taking the first class defined in that module whose name ends with
    ``Multiplier``.

    Returns:
        A list of ``(label, cls, needs_k)`` tuples, where ``needs_k`` is True
        when the class's ``__init__`` has a parameter named ``k``.
    """
    from multiplier.bit_1.pytorch import PytorchBF16Multiplier

    found = [("pytorch", PytorchBF16Multiplier, False)]

    repo_root = Path(__file__).resolve().parents[2]
    cpu_dir = repo_root / "multiplier" / "bit_1" / "cpu"

    for path in sorted(cpu_dir.glob("*.py")):
        stem = path.stem
        if stem in _EXCLUDE or stem.startswith("_"):
            continue

        module_name = f"multiplier.bit_1.cpu.{stem}"
        label = _stem_to_label(stem)

        # Best effort: a module that fails to import is reported and skipped
        # rather than aborting the whole benchmark.
        try:
            module = importlib.import_module(module_name)
        except Exception as exc:
            print(f" [skip {stem}: {exc}]")
            continue

        # Only classes defined in this very module count, not ones it imports.
        cls = None
        for _, candidate in inspect.getmembers(module, inspect.isclass):
            if candidate.__module__ != module_name:
                continue
            if candidate.__name__.endswith("Multiplier"):
                cls = candidate
                break
        if cls is None:
            continue

        init_params = inspect.signature(cls.__init__).parameters
        found.append((label, cls, "k" in init_params))

    return found
126+
127+
128+
# ---------------------------------------------------------------------------
129+
# Main
130+
# ---------------------------------------------------------------------------
131+
132+
133+
def main():
    """Run the shape benchmark, print a table per shape and write a CSV report.

    For every shape in ``SHAPES`` a random binary matrix and a random vector
    are created. Multipliers that take no ``k`` (baselines) are timed once per
    shape and their timings are repeated on every row; multipliers that take
    ``k`` are timed once per value in ``K_VALUES``. A failed or skipped
    measurement is shown as ``N/A`` and written as an empty CSV cell.

    Results are written, in milliseconds, to
    ``reports/results_shapes_cpu.csv`` next to this script.
    """
    versions = discover_versions()
    if METHODS:
        versions = [entry for entry in versions if entry[0] in METHODS]

    baselines = [(label, cls) for label, cls, needs_k in versions if not needs_k]
    rsr_vers = [(label, cls) for label, cls, needs_k in versions if needs_k]
    all_labels = [label for label, _ in baselines] + [label for label, _ in rsr_vers]

    reports_dir = Path(__file__).parent / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    csv_path = reports_dir / "results_shapes_cpu.csv"

    col_w = 12

    # The context manager guarantees the report file is closed even when a
    # benchmark raises or the (potentially very long) run is interrupted.
    with open(csv_path, "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["rows", "cols", "k"] + all_labels)

        for rows, cols in SHAPES:
            print(f"\n{'='*80}")
            print(f" shape = ({rows}, {cols})")
            print(f"{'='*80}")

            M = random_binary_matrix(rows, cols)
            v = torch.randn(cols, dtype=torch.float32)

            # Baselines do not depend on k, so they are measured once per shape.
            base_times = []
            for lbl, cls in baselines:
                try:
                    m = cls(M)
                    t = bench(m, v)
                except Exception as e:
                    print(f" [error {lbl}: {e}]")
                    t = float("nan")
                base_times.append(t)

            header = f" {'k':>4} " + " ".join(f"{c:>{col_w}}" for c in all_labels)
            print(f"\n [Inference — median over {REPEATS} runs]")
            print(header)
            print(" " + "-" * (len(header) - 2))

            for k in K_VALUES:
                rsr_times = []
                for lbl, cls in rsr_vers:
                    # A k that does not evenly divide the row count is not run.
                    if rows % k != 0:
                        rsr_times.append(float("nan"))
                        continue
                    try:
                        m = cls(M, k)
                        rsr_times.append(bench(m, v))
                    except Exception as e:
                        print(f" [error {lbl} k={k}: {e}]")
                        rsr_times.append(float("nan"))

                all_times = base_times + rsr_times
                valid = [t for t in all_times if not np.isnan(t)]
                best = min(valid) if valid else None

                cells = []
                for t in all_times:
                    s = fmt(t)
                    # Wrap the fastest timing(s) of the row in asterisks.
                    if best is not None and not np.isnan(t) and abs(t - best) < 1e-9:
                        s = f"*{s}*"
                    cells.append(s.rjust(col_w))

                print(f" {k:>4} " + " ".join(cells))
                writer.writerow(
                    [rows, cols, k]
                    + ["" if np.isnan(t) else round(t * 1e3, 6) for t in all_times]
                )
                # Flush per row so partial results survive an aborted run.
                csv_file.flush()

            print()

    print(f"Results saved to {csv_path}")
209+
210+
211+
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)