Skip to content

Commit 75d26f1

Browse files
authored
feat: add NEON support on arm64
llhttp benchmarks on M2 mac. Before: url (C) 8192.00 mb | 2190.59 mb/s | 44173087.93 ops/sec | 3.74 s http: "seanmonstar/httparse" (C) 8192.00 mb | 1756.10 mb/s | 2619357.05 ops/sec | 4.66 s http: "nodejs/http-parser" (C) 8192.00 mb | 1467.88 mb/s | 2959960.95 ops/sec | 5.58 s After this commit: url (C) 8192.00 mb | 2211.35 mb/s | 44591682.98 ops/sec | 3.70 s http: "seanmonstar/httparse" (C) 8192.00 mb | 3183.58 mb/s | 4748540.45 ops/sec | 2.57 s http: "nodejs/http-parser" (C) 8192.00 mb | 2123.17 mb/s | 4281349.70 ops/sec | 3.86 s
1 parent 660bb81 commit 75d26f1

3 files changed

Lines changed: 93 additions & 2 deletions

File tree

src/implementation/c/index.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,11 @@ export class CCompiler {
4949
out.push('#endif /* __SSE4_2__ */');
5050
out.push('');
5151

52+
out.push('#ifdef __ARM_NEON__');
53+
out.push(' #include <arm_neon.h>');
54+
out.push('#endif /* __ARM_NEON__ */');
55+
out.push('');
56+
5257
out.push('#ifdef __wasm__');
5358
out.push(' #include <wasm_simd128.h>');
5459
out.push('#endif /* __wasm__ */');

src/implementation/c/node/table-lookup.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ const SSE_RANGES_LEN = 16;
1111
// _mm_cmpestri takes 128bit input
1212
const SSE_RANGES_PAD = 16;
1313
const MAX_SSE_CALLS = 2;
14+
const MAX_NEON_RANGES = 6;
1415
const MAX_WASM_RANGES = 6;
1516
const SSE_ALIGNMENT = 16;
1617

@@ -37,6 +38,7 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
3738
// stream for vectorized processing.
3839
if (this.canVectorize()) {
3940
this.buildSSE(out);
41+
this.buildNeon(out);
4042
this.buildWASM(out);
4143
}
4244

@@ -183,6 +185,91 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
183185
return true;
184186
}
185187

188+
private buildNeon(out: string[]): boolean {
189+
const ctx = this.compilation;
190+
191+
const edge = this.ref.edges[0];
192+
assert(edge !== undefined);
193+
194+
const ranges = this.buildRanges(edge);
195+
196+
if (ranges.length === 0) {
197+
return false;
198+
}
199+
200+
// Way too many calls would be required
201+
if (ranges.length > MAX_NEON_RANGES) {
202+
return false;
203+
}
204+
205+
out.push('#ifdef __ARM_NEON__');
206+
out.push(`while (${ctx.endPosArg()} - ${ctx.posArg()} >= 16) {`);
207+
out.push(' uint8x16_t input;');
208+
out.push(' uint8x16_t single;');
209+
out.push(' uint8x16_t mask;');
210+
out.push(' uint8x8_t narrow;');
211+
out.push(' uint64_t match_mask;');
212+
out.push(' int match_len;');
213+
out.push('');
214+
out.push(' /* Load input */');
215+
out.push(` input = vld1q_u8(${ctx.posArg()});`);
216+
217+
out.push(' /* Find first character that does not match `ranges` */');
218+
function v128(value: number): string {
219+
return `vdupq_n_u8(${ctx.toChar(value)})`;
220+
}
221+
222+
for (let off = 0; off < ranges.length; off += 2) {
223+
const start = ranges[off];
224+
const end = ranges[off + 1];
225+
assert(start !== undefined);
226+
assert(end !== undefined);
227+
228+
// Same character, equality is sufficient (and faster)
229+
if (start === end) {
230+
out.push(` single = vceqq_u8(input, ${v128(start)});`);
231+
} else {
232+
out.push(` single = vandq_u16(`);
233+
out.push(` vcgeq_u8(input, ${v128(start)}),`);
234+
out.push(` vcleq_u8(input, ${v128(end)})`);
235+
out.push(' );');
236+
}
237+
238+
if (off === 0) {
239+
out.push(' mask = single;');
240+
} else {
241+
out.push(' mask = vorrq_u16(mask, single);');
242+
}
243+
}
244+
245+
// https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
246+
out.push(' narrow = vshrn_n_u16(mask, 4);');
247+
out.push(' match_mask = ~vget_lane_u64(vreinterpret_u64_u8(narrow), 0);');
248+
out.push(' match_len = __builtin_ctzll(match_mask) >> 2;');
249+
out.push(' if (match_len != 16) {');
250+
out.push(` ${ctx.posArg()} += match_len;`);
251+
{
252+
const tmp: string[] = [];
253+
this.tailTo(tmp, this.ref.otherwise!);
254+
ctx.indent(out, tmp, ' ');
255+
}
256+
out.push(' }');
257+
out.push(` ${ctx.posArg()} += 16;`);
258+
out.push('}');
259+
260+
out.push(`if (${ctx.posArg()} == ${ctx.endPosArg()}) {`);
261+
{
262+
const tmp: string[] = [];
263+
this.pause(tmp);
264+
this.compilation.indent(out, tmp, ' ');
265+
}
266+
out.push('}');
267+
268+
out.push('#endif /* __ARM_NEON__ */');
269+
270+
return true;
271+
}
272+
186273
private buildWASM(out: string[]): boolean {
187274
const ctx = this.compilation;
188275

test/fixtures/extra.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ int llparse__pause_once(llparse_t* s, const char* p, const char* endp) {
7979
}
8080

8181

82-
int llparse__test_init() {
82+
void llparse__test_init(llparse_t*) {
8383
llparse__pause_once_counter = 0;
84-
return 0;
8584
}

0 commit comments

Comments
 (0)