Skip to content

Commit 5d54b5a

Browse files
committed
Fix tag parsing to follow spec
1 parent 79df27a commit 5d54b5a

5 files changed

Lines changed: 112 additions & 89 deletions

File tree

DOCUMENTATION.md

Lines changed: 28 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -193,49 +193,49 @@ Source: `bench/results/latest.json` (`stable` profile).
193193

194194
| Fixture | ours | lol-html | lexbor |
195195
|---|---:|---:|---:|
196-
| `rust-lang.html` | 1776.74 | 1414.69 | 339.72 |
197-
| `wiki-html.html` | 1444.00 | 1204.63 | 269.72 |
198-
| `mdn-html.html` | 2674.83 | 1843.82 | 407.58 |
199-
| `w3-html52.html` | 1022.94 | 625.82 | 197.90 |
200-
| `hn.html` | 1026.96 | 859.75 | 187.58 |
201-
| `python-org.html` | 1566.90 | 1271.55 | 239.03 |
202-
| `kernel-org.html` | 1513.60 | 1271.77 | 289.64 |
203-
| `gnu-org.html` | 1891.88 | 1116.18 | 273.89 |
204-
| `ziglang-org.html` | 1204.24 | 1150.31 | 231.72 |
205-
| `ziglang-doc-master.html` | 1023.89 | 1023.31 | 207.34 |
206-
| `wikipedia-unicode-list.html` | 1264.07 | 1056.25 | 219.03 |
207-
| `whatwg-html-spec.html` | 1113.04 | 833.01 | 225.13 |
208-
| `synthetic-forms.html` | 936.16 | 680.43 | 173.78 |
209-
| `synthetic-table-grid.html` | 811.26 | 620.73 | 169.49 |
210-
| `synthetic-list-nested.html` | 795.40 | 566.38 | 147.05 |
211-
| `synthetic-comments-doctype.html` | 1152.15 | 778.54 | 223.00 |
212-
| `synthetic-template-rich.html` | 668.66 | 415.97 | 123.23 |
213-
| `synthetic-whitespace-noise.html` | 1002.33 | 895.15 | 172.90 |
214-
| `synthetic-news-feed.html` | 826.34 | 558.76 | 146.02 |
215-
| `synthetic-ecommerce.html` | 792.52 | 533.54 | 142.63 |
216-
| `synthetic-forum-thread.html` | 925.93 | 609.20 | 146.80 |
196+
| `rust-lang.html` | 1761.16 | 1494.33 | 338.55 |
197+
| `wiki-html.html` | 1640.45 | 945.59 | 270.88 |
198+
| `mdn-html.html` | 1958.45 | 1608.04 | 357.76 |
199+
| `w3-html52.html` | 910.95 | 708.66 | 173.18 |
200+
| `hn.html` | 1159.34 | 831.40 | 197.83 |
201+
| `python-org.html` | 1515.96 | 1188.99 | 262.11 |
202+
| `kernel-org.html` | 1417.57 | 1139.69 | 267.87 |
203+
| `gnu-org.html` | 1676.43 | 1321.14 | 290.69 |
204+
| `ziglang-org.html` | 1429.08 | 1224.85 | 267.44 |
205+
| `ziglang-doc-master.html` | 1083.64 | 1050.83 | 216.27 |
206+
| `wikipedia-unicode-list.html` | 1320.52 | 1082.30 | 233.50 |
207+
| `whatwg-html-spec.html` | 1140.34 | 861.74 | 228.68 |
208+
| `synthetic-forms.html` | 1121.33 | 767.17 | 193.16 |
209+
| `synthetic-table-grid.html` | 859.06 | 710.84 | 174.37 |
210+
| `synthetic-list-nested.html` | 936.91 | 646.39 | 166.53 |
211+
| `synthetic-comments-doctype.html` | 1350.02 | 931.86 | 227.39 |
212+
| `synthetic-template-rich.html` | 736.39 | 472.56 | 147.89 |
213+
| `synthetic-whitespace-noise.html` | 1170.18 | 1003.19 | 184.57 |
214+
| `synthetic-news-feed.html` | 933.79 | 638.74 | 160.58 |
215+
| `synthetic-ecommerce.html` | 854.67 | 629.06 | 167.41 |
216+
| `synthetic-forum-thread.html` | 935.98 | 640.10 | 165.39 |
217217

218218
#### Query Match Throughput (ours)
219219

220220
| Case | ours ops/s | ours ns/op |
221221
|---|---:|---:|
222-
| `attr-heavy-button` | 1229788.68 | 813.15 |
223-
| `attr-heavy-nav` | 1200562.39 | 832.94 |
222+
| `attr-heavy-button` | 1260674.48 | 793.23 |
223+
| `attr-heavy-nav` | 1257905.76 | 794.97 |
224224

225225
#### Cached Query Throughput (ours)
226226

227227
| Case | ours ops/s | ours ns/op |
228228
|---|---:|---:|
229-
| `attr-heavy-button` | 1438363.33 | 695.23 |
230-
| `attr-heavy-nav` | 1462039.63 | 683.98 |
229+
| `attr-heavy-button` | 1483060.91 | 674.28 |
230+
| `attr-heavy-nav` | 1483527.34 | 674.07 |
231231

232232
#### Query Parse Throughput (ours)
233233

234234
| Selector case | Ops/s | ns/op |
235235
|---|---:|---:|
236-
| `simple` | 19027989.41 | 52.55 |
237-
| `complex` | 6118575.58 | 163.44 |
238-
| `grouped` | 7187707.80 | 139.13 |
236+
| `simple` | 19114633.13 | 52.32 |
237+
| `complex` | 6259641.80 | 159.75 |
238+
| `grouped` | 6915365.57 | 144.61 |
239239

240240
For full per-parser, per-fixture tables and gate output:
241241
- `bench/results/latest.md`
@@ -260,7 +260,6 @@ Tracked suites:
260260
- parser suites:
261261
- html5lib tree-construction subset
262262
- WHATWG HTML parsing corpus (via WPT `html/syntax/parsing/html5lib_*.html`)
263-
- WPT HTML parsing corpus (non-`html5lib_*` files under `html/syntax/parsing/`)
264263

265264
Fetched suite repos are cached under `bench/.cache/suites/` (gitignored).
266265

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,16 @@ Source: `bench/results/latest.json` (`stable` profile).
2525
### Parse Throughput (Average Across Fixtures)
2626

2727
```text
28-
ours │████████████████████│ 1211.13 MB/s (100.00%)
29-
lol-html │███████████████░░░░░│ 920.47 MB/s (76.00%)
30-
lexbor │████░░░░░░░░░░░░░░░░│ 215.87 MB/s (17.82%)
28+
ours │████████████████████│ 1233.92 MB/s (100.00%)
29+
lol-html │███████████████░░░░░│ 947.50 MB/s (76.79%)
30+
lexbor │████░░░░░░░░░░░░░░░░│ 223.43 MB/s (18.11%)
3131
```
3232

3333
### Conformance Snapshot
3434

35-
| Profile | nwmatcher | qwery_contextual | html5lib subset | WHATWG HTML parsing | WPT HTML parsing |
36-
|---|---:|---:|---:|---:|---:|
37-
| `strictest/fastest` | 20/20 (0 failed) | 54/54 (0 failed) | 523/600 (77 failed) | 439/500 (61 failed) | 439/500 (61 failed) |
35+
| Profile | nwmatcher | qwery_contextual | html5lib subset | WHATWG HTML parsing |
36+
|---|---:|---:|---:|---:|
37+
| `strictest/fastest` | 20/20 (0 failed) | 54/54 (0 failed) | 524/600 (76 failed) | 440/500 (60 failed) |
3838

3939
Source: `bench/results/external_suite_report.json`
4040
<!-- README_AUTO_SUMMARY:END -->

src/html/tables.zig

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,10 @@ pub fn isIdentChar(c: u8) bool {
3636
return isIdentStart(c) or (c >= '0' and c <= '9') or c == '-' or c == '.';
3737
}
3838

39-
/// Returns whether byte is valid in tag names.
39+
/// Returns whether the byte `c` is consumed as part of the name in the HTML tag-name state.
40+
/// Mirrors the tokenizer's tag-name loop: the name continues until whitespace, `/`, `>`, or NUL.
4041
pub fn isTagNameChar(c: u8) bool {
41-
return isIdentChar(c);
42+
return !isWhitespace(c) and c != '/' and c != '>' and c != 0;
4243
}
4344

4445
/// Precomputed lowercase conversion table.
@@ -106,3 +107,10 @@ test "lower table" {
106107
try std.testing.expect(lower('A') == 'a');
107108
try std.testing.expect(lower('z') == 'z');
108109
}
110+
111+
test "tag name state includes < and excludes delimiters" {
112+
try std.testing.expect(isTagNameChar('<'));
113+
try std.testing.expect(!isTagNameChar('>'));
114+
try std.testing.expect(!isTagNameChar('/'));
115+
try std.testing.expect(!isTagNameChar(' '));
116+
}

src/root.zig

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,18 @@ test "smoke parse/query" {
6868
try std.testing.expectEqualStrings("div", parent.tagName());
6969
try std.testing.expect(doc.queryOne("div > span.k") != null);
7070
}
71+
72+
test "tag-name state keeps < inside malformed start tag name" {
73+
const alloc = std.testing.allocator;
74+
const opts: ParseOptions = .{};
75+
const Document = opts.GetDocument();
76+
77+
var doc = Document.init(alloc);
78+
defer doc.deinit();
79+
80+
var src = "<div<div>".*;
81+
try doc.parse(&src, .{});
82+
83+
const first = doc.nodeAt(1) orelse return error.TestUnexpectedResult;
84+
try std.testing.expectEqualStrings("div<div", first.tagName());
85+
}

0 commit comments

Comments
 (0)