Skip to content

Commit 0553d3a

Browse files
committed
Add SEO post-processing for main wiki
1 parent b4984df commit 0553d3a

5 files changed

Lines changed: 282 additions & 3 deletions

File tree

.github/workflows/build_master.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ jobs:
3434
# Build the mdBook
3535
- name: Build mdBook
3636
run: MDBOOK_BOOK__LANGUAGE=en mdbook build || (echo "Error logs" && cat hacktricks-preprocessor-error.log && echo "" && echo "" && echo "Debug logs" && (cat hacktricks-preprocessor.log | tail -n 20) && exit 1)
37+
38+
- name: Post-process SEO artifacts
39+
run: |
40+
python3 scripts/seo_postprocess.py pages \
41+
--book-dir ./book \
42+
--site-url https://hacktricks.wiki \
43+
--lang en \
44+
--default-lang en \
45+
--site-name "HackTricks"
3746
3847
- name: Install GitHub CLI
3948
run: |
@@ -176,6 +185,15 @@ jobs:
176185
- name: Sync to S3
177186
run: aws s3 sync ./book s3://hacktricks-wiki/en --delete
178187

188+
- name: Upload root sitemap index
189+
run: |
190+
LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-wiki --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
191+
if [ -z "$LANGS" ]; then
192+
LANGS="en"
193+
fi
194+
python3 scripts/seo_postprocess.py index --site-url https://hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
195+
aws s3 cp ./sitemap.xml s3://hacktricks-wiki/sitemap.xml --content-type application/xml --cache-control max-age=300
196+
179197
- name: Upload root ads.txt
180198
run: aws s3 cp ./src/ads.txt s3://hacktricks-wiki/ads.txt --content-type text/plain --cache-control max-age=300
181199

.github/workflows/translate_all.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,15 @@ jobs:
248248
with:
249249
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
250250
aws-region: us-east-1
251+
252+
- name: Post-process SEO artifacts
253+
run: |
254+
python3 scripts/seo_postprocess.py pages \
255+
--book-dir ./book \
256+
--site-url https://hacktricks.wiki \
257+
--lang "$BRANCH" \
258+
--default-lang en \
259+
--site-name "HackTricks"
251260
252261
# Sync the build to S3
253262
- name: Sync to S3
@@ -259,3 +268,12 @@ jobs:
259268
echo "Sync completed"
260269
echo "Cat 3 files from the book"
261270
find . -type f -name 'index.html' -print | head -n 3 | xargs -r cat
271+
272+
- name: Refresh root sitemap index
273+
run: |
274+
LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-wiki --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
275+
if [ -z "$LANGS" ]; then
276+
LANGS="en"
277+
fi
278+
python3 scripts/seo_postprocess.py index --site-url https://hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
279+
aws s3 cp ./sitemap.xml s3://hacktricks-wiki/sitemap.xml --content-type application/xml --cache-control max-age=300

scripts/seo_postprocess.py

Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
import argparse
2+
import html
3+
import os
4+
import re
5+
from datetime import datetime, timezone
6+
from pathlib import Path
7+
import xml.etree.ElementTree as ET
8+
9+
10+
# Language codes used when the caller does not pass --languages explicitly.
# Each entry is a two-letter ISO 639-1 code; parse_languages() filters on
# exactly this [a-z]{2} shape, so anything else here would be dropped.
DEFAULT_LANGUAGES = [
    "af",
    "zh",
    "es",
    "en",
    "fr",
    "de",
    "el",
    "hi",
    "it",
    "ja",
    "ko",
    "pl",
    "pt",
    "sr",
    "sw",
    "tr",
    "uk",
]

# mdBook output files that should receive neither SEO tags nor sitemap entries.
SKIP_HTML = {"404.html", "print.html", "toc.html"}
# Markers delimiting the injected SEO <link> block, so a re-run can find and
# replace a previously injected block instead of duplicating it.
SEO_START = "<!-- HT_SEO_START -->"
SEO_END = "<!-- HT_SEO_END -->"
33+
34+
35+
def parse_args():
    """Build the CLI: one required subcommand, either ``pages`` or ``index``.

    ``pages`` post-processes a built book directory in place; ``index``
    writes a root sitemap index file.
    """
    root = argparse.ArgumentParser()
    commands = root.add_subparsers(dest="command", required=True)

    pages_cmd = commands.add_parser("pages")
    for required_flag in ("--book-dir", "--site-url", "--lang"):
        pages_cmd.add_argument(required_flag, required=True)
    pages_cmd.add_argument("--default-lang", default="en")
    pages_cmd.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES))
    pages_cmd.add_argument("--site-name", default="HackTricks")

    index_cmd = commands.add_parser("index")
    for required_flag in ("--site-url", "--languages", "--output"):
        index_cmd.add_argument(required_flag, required=True)

    return root.parse_args()
53+
54+
55+
def parse_languages(raw):
    """Parse a comma-separated language string into sorted unique codes.

    Only tokens that are exactly two lowercase ASCII letters survive;
    everything else (empty tokens, uppercase, longer codes) is discarded.
    """
    tokens = {token.strip() for token in raw.split(",")}
    return sorted(code for code in tokens if re.fullmatch(r"[a-z]{2}", code))
62+
63+
64+
def iter_html_files(book_dir):
    """Yield every *.html under ``book_dir`` (sorted), skipping SKIP_HTML names."""
    candidates = sorted(Path(book_dir).rglob("*.html"))
    yield from (page for page in candidates if page.name not in SKIP_HTML)
69+
70+
71+
def canonical_url(site_url, lang, rel_path):
    """Absolute URL for ``rel_path`` under the given language prefix."""
    base = site_url.rstrip("/")
    return "/".join((base, lang, rel_path.as_posix()))
73+
74+
75+
def clean_text(fragment):
    """Reduce an HTML fragment to plain text with normalised whitespace.

    Script and style elements are removed wholesale (their content too),
    remaining tags become spaces, entities are unescaped, and runs of
    whitespace collapse to single spaces.
    """
    for container in (r"<script\b[^>]*>.*?</script>", r"<style\b[^>]*>.*?</style>"):
        fragment = re.sub(container, " ", fragment, flags=re.I | re.S)
    text = re.sub(r"<[^>]+>", " ", fragment)
    text = html.unescape(text)
    return re.sub(r"\s+", " ", text).strip()
82+
83+
84+
def trim_description(text, fallback):
    """Normalise whitespace and cap a meta description at 160 characters.

    Falls back to ``fallback`` when ``text`` is empty/None. When truncating,
    cuts at 157 chars, backs up to the last space (if any) so no word is
    split, and appends "...".
    """
    normalised = re.sub(r"\s+", " ", text or fallback).strip()
    if len(normalised) <= 160:
        return normalised
    truncated = normalised[:157]
    last_space = truncated.rfind(" ")
    if last_space != -1:
        truncated = truncated[:last_space]
    return truncated + "..."
93+
94+
95+
def extract_description(document, fallback):
    """Pick a meta-description candidate from a rendered HTML page.

    Prefers text inside <main>; scans paragraphs first, then list items,
    then h1/h2 headings, returning the first candidate of at least 40
    visible characters. Falls back to the whole scope's text otherwise.
    """
    body = re.search(r"<main\b[^>]*>(.*?)</main>", document, flags=re.I | re.S)
    scope = body.group(1) if body else document

    candidate_patterns = (
        r"<p\b[^>]*>(.*?)</p>",
        r"<li\b[^>]*>(.*?)</li>",
        r"<h[12]\b[^>]*>(.*?)</h[12]>",
    )
    for pattern in candidate_patterns:
        for candidate in re.finditer(pattern, scope, flags=re.I | re.S):
            stripped = clean_text(candidate.group(1))
            if len(stripped) >= 40:
                return trim_description(stripped, fallback)

    return trim_description(clean_text(scope), fallback)
106+
107+
108+
def build_seo_block(site_url, lang, rel_path, languages, default_lang):
    """Render the canonical + hreflang <link> block between the SEO markers.

    Emits one canonical link for the current language, one alternate per
    language in ``languages``, and an x-default alternate pointing at
    ``default_lang``.
    """
    def _esc(url):
        return html.escape(url, quote=True)

    parts = [SEO_START]
    parts.append(f'<link rel="canonical" href="{_esc(canonical_url(site_url, lang, rel_path))}">')
    parts.extend(
        f'<link rel="alternate" hreflang="{alt_lang}" href="{_esc(canonical_url(site_url, alt_lang, rel_path))}">'
        for alt_lang in languages
    )
    parts.append(
        f'<link rel="alternate" hreflang="x-default" href="{_esc(canonical_url(site_url, default_lang, rel_path))}">'
    )
    parts.append(SEO_END)
    return "\n ".join(parts)
122+
123+
124+
def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name):
    """Rewrite one HTML page: refresh the meta description and SEO <link> block.

    Idempotent by construction: any previously injected HT_SEO block is
    stripped before a fresh one is appended just inside </head>.
    """
    # Page title drives the fallback description ("<site>: <title>").
    title_match = re.search(r"<title>(.*?)</title>", document, flags=re.I | re.S)
    page_title = clean_text(title_match.group(1)) if title_match else site_name
    fallback_description = f"{site_name}: {page_title}"
    description = extract_description(document, fallback_description)
    seo_block = build_seo_block(site_url, lang, rel_path, languages, default_lang)

    # Remove any SEO block from a previous run (markers included) so the
    # later injection never duplicates it.
    document = re.sub(
        r"\s*<!-- HT_SEO_START -->.*?<!-- HT_SEO_END -->\s*",
        "\n",
        document,
        flags=re.S,
    )

    # Update an existing meta description in place; otherwise insert one
    # right after <title>. NOTE(review): the pattern only matches the
    # name-before-content, double-quoted form — presumably what mdBook
    # emits; confirm against the template.
    if re.search(r'<meta\s+name="description"\s+content="[^"]*"\s*/?>', document, flags=re.I):
        # NOTE(review): a description containing a backslash would be
        # interpreted as an escape in this re.sub replacement — confirm
        # extracted text can never contain one, or pre-escape it.
        document = re.sub(
            r'(<meta\s+name="description"\s+content=")[^"]*("\s*/?>)',
            r"\1" + html.escape(description, quote=True) + r"\2",
            document,
            count=1,
            flags=re.I,
        )
    elif title_match:
        document = document.replace(
            title_match.group(0),
            title_match.group(0) + f'\n <meta name="description" content="{html.escape(description, quote=True)}">',
            1,
        )

    # Inject the fresh SEO block immediately before the first </head>.
    document = re.sub(r"</head>", f" {seo_block}\n </head>", document, count=1, flags=re.I)
    return document
155+
156+
157+
def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang):
    """Write ``<book_dir>/sitemap.xml`` for one language build.

    Each non-skipped HTML page becomes a <url> entry with its canonical
    <loc>, a <lastmod> from the file's mtime (UTC date), one xhtml:link
    alternate per language, and an x-default alternate.
    """
    sm_ns = "http://www.sitemaps.org/schemas/sitemap/0.9"
    xhtml_ns = "http://www.w3.org/1999/xhtml"
    ET.register_namespace("", sm_ns)
    ET.register_namespace("xhtml", xhtml_ns)

    urlset = ET.Element(f"{{{sm_ns}}}urlset")

    for page in iter_html_files(book_dir):
        relative = page.relative_to(book_dir)
        entry = ET.SubElement(urlset, f"{{{sm_ns}}}url")
        ET.SubElement(entry, f"{{{sm_ns}}}loc").text = canonical_url(site_url, lang, relative)
        modified = datetime.fromtimestamp(page.stat().st_mtime, tz=timezone.utc).date().isoformat()
        ET.SubElement(entry, f"{{{sm_ns}}}lastmod").text = modified

        for alt_lang in languages:
            ET.SubElement(
                entry,
                f"{{{xhtml_ns}}}link",
                {
                    "rel": "alternate",
                    "hreflang": alt_lang,
                    "href": canonical_url(site_url, alt_lang, relative),
                },
            )

        ET.SubElement(
            entry,
            f"{{{xhtml_ns}}}link",
            {
                "rel": "alternate",
                "hreflang": "x-default",
                "href": canonical_url(site_url, default_lang, relative),
            },
        )

    destination = Path(book_dir) / "sitemap.xml"
    ET.ElementTree(urlset).write(destination, encoding="utf-8", xml_declaration=True)
196+
197+
198+
def process_pages(args):
    """Handle the ``pages`` subcommand: rewrite every page, then emit the sitemap.

    Each HTML file under the book directory is read, run through
    update_document(), and written back in place (UTF-8 throughout).
    """
    book_dir = Path(args.book_dir)
    languages = parse_languages(args.languages)

    for page in iter_html_files(book_dir):
        relative = page.relative_to(book_dir)
        original = page.read_text(encoding="utf-8")
        rewritten = update_document(
            original,
            args.site_url,
            args.lang,
            relative,
            languages,
            args.default_lang,
            args.site_name,
        )
        page.write_text(rewritten, encoding="utf-8")

    generate_language_sitemap(book_dir, args.site_url, args.lang, languages, args.default_lang)
217+
218+
219+
def generate_sitemap_index(args):
    """Handle the ``index`` subcommand: write a root sitemap index file.

    One <sitemap> entry per parsed language, pointing at that language's
    per-build sitemap, all stamped with today's UTC date.
    """
    sm_ns = "http://www.sitemaps.org/schemas/sitemap/0.9"
    ET.register_namespace("", sm_ns)
    root = ET.Element(f"{{{sm_ns}}}sitemapindex")
    today = datetime.now(timezone.utc).date().isoformat()

    for lang in parse_languages(args.languages):
        entry = ET.SubElement(root, f"{{{sm_ns}}}sitemap")
        ET.SubElement(entry, f"{{{sm_ns}}}loc").text = (
            f"{args.site_url.rstrip('/')}/{lang}/sitemap.xml"
        )
        ET.SubElement(entry, f"{{{sm_ns}}}lastmod").text = today

    ET.ElementTree(root).write(args.output, encoding="utf-8", xml_declaration=True)
232+
233+
234+
def main():
    """Entry point: dispatch the parsed subcommand to its handler."""
    args = parse_args()
    handlers = {
        "pages": process_pages,
        "index": generate_sitemap_index,
    }
    # The subparser is declared required, so args.command is always one
    # of the keys above.
    handlers[args.command](args)
240+
241+
242+
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

src/robots.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Sitemap: https://www.hacktricks.wiki/sitemap.xml
1+
Sitemap: https://hacktricks.wiki/sitemap.xml
22

33
User-agent: *
4-
Disallow:
4+
Disallow:

theme/sponsor.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
async function getSponsor() {
1818
const currentUrl = encodeURIComponent(window.location.href);
19-
const url = `https://book.hacktricks.wiki/sponsor?current_url=${currentUrl}`;
19+
const url = `https://hacktricks.wiki/sponsor?current_url=${currentUrl}`;
2020
try {
2121
const response = await fetch(url, { method: "GET" })
2222
if (!response.ok) {

0 commit comments

Comments
 (0)