Skip to content

Commit 66576d2

Browse files
apartsin and claude committed
Major content expansion: new sections, appendix overhaul, structural fixes
New content:
- Section 13.8: Data Augmentation for LLMs (EDA, back-translation, LLM-powered augmentation)
- Section 15.4: Soft Prompts (Prompt Tuning, Prefix Tuning, P-Tuning v1/v2)
- Section 20.9: Source Attribution and Citation in RAG (inline citation, NLI verification, ALCE)
- Section 32.13: Federated Learning for LLMs (FedAvg, federated LoRA, Flower framework)
- Expanded Code Models coverage in 7.2 (FIM training, pass@k, DeepSeek-Coder, StarCoder2)
- New Appendix T sections: T.1 PySpark for LLM Data Pipelines, T.4 Databricks AI

Appendix overhaul (22 index pages, 98 section files):
- Standardized section lists, figures, headers across all appendices
- Removed duplicate inline content from appendices A-J (~3600 lines)
- Added Big Picture, Prerequisites, When to Use callouts to A-J
- Renamed 20 appendix titles for clarity and consistency
- Reordered Appendix T sections around core platforms
- Updated part-labels from "Appendices" to book title

ToC improvements:
- Added group headers (Foundations, Reference, Framework, Infrastructure, Ecosystem)
- Updated all appendix and framework titles to full descriptive names
- Added section 15.4 and all new sections to detailed listings

Structural fixes in 8 long sections:
- Fixed duplicate h2 numbering in 16.1, 18.4, 20.6
- Moved misplaced content before Key Takeaways in 12.5, 18.2, 19.4, 20.6
- Fixed skipped h2 numbering in 13.2

Other fixes:
- Fixed unclosed <a> tag in section-21.3 and duplicate <code> tags in 32.1, 33.3
- Added detect_unclosed_tags.py audit script
- CSS: added .dense-group-header for ToC appendix groups
- Updated nav chains for all new and reordered sections

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 55e8b83 commit 66576d2

200 files changed

Lines changed: 6681 additions & 5851 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 144 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,144 @@
1+
#!/usr/bin/env python3
2+
"""Detect unclosed HTML tags across all book pages.
3+
4+
Checks for mismatched open/close counts of inline and block tags
5+
that commonly cause rendering issues when left unclosed.
6+
Uses stack-based matching to identify the exact unclosed tag location.
7+
8+
Usage:
9+
python detect_unclosed_tags.py [--verbose]
10+
"""
11+
12+
import re
13+
import sys
14+
import glob
15+
import argparse
16+
from pathlib import Path
17+
18+
# Elements whose opening and closing tags are counted against each other.
# The sequence is kept as a list because the scanner walks it in order.
CHECKED_TAGS = [
    # inline text and links
    "a", "strong", "em", "code", "span",
    # generic containers, list items and table parts
    "div", "p", "li", "td", "th", "tr", "table",
    # preformatted / sectioning / disclosure content
    "pre", "blockquote", "section", "article", "aside", "details", "summary",
    # headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    # list containers and definition lists
    "ul", "ol", "dl", "dt", "dd",
    # figures and form controls
    "figure", "figcaption", "label", "button", "select", "textarea", "form",
]

# Void elements: they are written without a closing tag.
# NOTE(review): nothing in the visible part of this script reads this set.
VOID_TAGS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Glob patterns (relative to the book root) selecting the pages to scan.
FILE_GLOBS = [
    "front-matter/**/*.html",
    "part-*/module-*/**/*.html",
    "part-*/capstone*/**/*.html",
    "appendices/**/*.html",
    "index.html",
    "toc.html",
]
36+
37+
38+
# Regions whose text must not be interpreted as markup: HTML comments and the
# bodies of <script>/<style> elements.
_IGNORED_REGION_RE = re.compile(
    r"<!--.*?-->"
    r"|(?P<open><(?P<name>script|style)\b[^>]*>)(?P<body>.*?)(?P<close></(?P=name)\s*>)",
    re.DOTALL | re.IGNORECASE,
)


def _mask_ignored_regions(content: str) -> str:
    """Blank out comments and script/style bodies without moving any offsets."""

    def blank(text: str) -> str:
        # Keep newlines so line numbers computed on the result stay correct.
        return re.sub(r"[^\n]", " ", text)

    def replace(match: re.Match) -> str:
        if match.group("body") is None:  # the comment alternative matched
            return blank(match.group())
        return match.group("open") + blank(match.group("body")) + match.group("close")

    return _IGNORED_REGION_RE.sub(replace, content)


def find_unclosed_tags(filepath: Path, verbose: bool = False) -> list[dict]:
    """Report unbalanced opening/closing tags in one HTML file.

    Each tag name in ``CHECKED_TAGS`` is checked independently: the file is
    scanned once per tag, opening tags are pushed on a stack and each closing
    tag pops the most recent opener. A closing tag seen with an empty stack is
    reported as "Extra"; openers left on the stack at the end are reported as
    "Unclosed". Tags written in self-closing form (``<tag ... />``) are ignored.

    Text inside HTML comments and inside ``<script>``/``<style>`` bodies is
    masked before scanning so that tag-like text there is not counted.

    Args:
        filepath: HTML file to read (decoded as UTF-8).
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        A list of issue dicts with keys ``"file"``, ``"tag"``, ``"line"``
        (1-based) and ``"msg"``. If the file cannot be read or decoded, a
        single issue with tag ``"?"`` and line ``0`` is returned instead.
    """
    try:
        content = filepath.read_text(encoding="utf-8")
    except (OSError, UnicodeError) as e:
        return [{"file": str(filepath), "tag": "?", "line": 0, "msg": f"Read error: {e}"}]

    content = _mask_ignored_regions(content)
    file_name = str(filepath)
    issues = []

    for tag in CHECKED_TAGS:
        name = re.escape(tag)
        # One pattern for both tag kinds, so matches arrive in document order.
        tag_re = re.compile(
            rf"(?P<close></{name}\s*>)|(?P<open><{name}(?:\s[^>]*)?>)",
            re.IGNORECASE,
        )

        open_stack = []  # (line number, tag text) of openers not yet closed
        line_num = 1
        scanned_to = 0
        for m in tag_re.finditer(content):
            # Advance the line counter by the newlines since the last match
            # instead of recounting from the start of the file every time.
            line_num += content.count("\n", scanned_to, m.start())
            scanned_to = m.start()

            text = m.group()
            if m.lastgroup == "open":
                if text.endswith("/>"):
                    continue  # self-closing form needs no closing tag
                open_stack.append((line_num, text[:100]))
            elif open_stack:
                open_stack.pop()
            else:
                issues.append({
                    "file": file_name,
                    "tag": tag,
                    "line": line_num,
                    "msg": f"Extra </{tag}> at line {line_num}",
                })

        issues.extend(
            {
                "file": file_name,
                "tag": tag,
                "line": open_line,
                "msg": f"Unclosed <{tag}> at line {open_line}: {open_text}",
            }
            for open_line, open_text in open_stack
        )

    return issues
91+
92+
93+
def _print_issues(header: str, issues: list, root: Path) -> None:
    """Print a section header followed by one line per issue, with paths relative to root."""
    print(header)
    for issue in issues:
        rel = Path(issue["file"]).relative_to(root)
        print(f" {rel}:{issue['line']} - {issue['msg']}")


def main():
    """Scan every book page matched by ``FILE_GLOBS`` and print a tag-balance report.

    Issues on inline/formatting tags are always listed as CRITICAL; issues on
    the remaining tags are listed only with ``--verbose``. A summary line with
    both counts is printed whenever at least one issue was found.
    """
    parser = argparse.ArgumentParser(description="Detect unclosed HTML tags across the book")
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    # Force UTF-8 output so non-ASCII tag text cannot crash printing. stdout
    # may have been replaced by an object without reconfigure(), so look it up
    # defensively.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8", errors="replace")

    root = Path(__file__).resolve().parents[3]  # LLMCourse root

    # A set removes files matched by more than one pattern; sorting keeps the
    # report order stable.
    files = sorted({path for pattern in FILE_GLOBS for path in root.glob(pattern)})
    print(f"Scanning {len(files)} HTML files for unclosed tags...")

    all_issues = []
    for f in files:
        all_issues.extend(find_unclosed_tags(f, args.verbose))

    if not all_issues:
        print("No unclosed tag issues found!")
        return

    # Only these tags are treated as critical rendering problems.
    critical_tags = {"a", "strong", "em", "code", "span", "pre"}
    critical_issues = [i for i in all_issues if i["tag"] in critical_tags]
    other_issues = [i for i in all_issues if i["tag"] not in critical_tags]

    if critical_issues:
        _print_issues(
            f"\n=== CRITICAL: {len(critical_issues)} unclosed inline/formatting tags ===",
            critical_issues,
            root,
        )

    if other_issues and args.verbose:
        _print_issues(
            f"\n=== INFO: {len(other_issues)} unclosed block tags (may be intentional) ===",
            other_issues,
            root,
        )

    print(f"\nTotal: {len(critical_issues)} critical, {len(other_issues)} block-level")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)