diff --git a/activerecon/modules/endpoint_discovery.py b/activerecon/modules/endpoint_discovery.py
index d519fc8..9b2eb00 100644
--- a/activerecon/modules/endpoint_discovery.py
+++ b/activerecon/modules/endpoint_discovery.py
@@ -22,6 +22,7 @@
DEFAULT_ENDPOINT_LIMIT = 50
DEFAULT_HTTP_TIMEOUT = 5
PATH_STRING_RE = re.compile(r"""["'`](/[A-Za-z0-9._~:/?#\[\]@!$&()*+,;=%-]{1,200})["'`]""")
+TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", flags=re.IGNORECASE | re.DOTALL)
class EndpointHTMLParser(HTMLParser):
@@ -34,13 +35,21 @@ def handle_starttag(self, tag, attrs):
attrs_dict = dict(attrs)
if "href" in attrs_dict:
- self.links.append((attrs_dict["href"], "html:href"))
+ rel = str(attrs_dict.get("rel", "")).lower()
+ if tag == "link" and "stylesheet" in rel:
+ self.links.append((attrs_dict["href"], "html:stylesheet"))
+ elif tag == "link" and "icon" in rel:
+ self.links.append((attrs_dict["href"], "html:icon"))
+ else:
+ self.links.append((attrs_dict["href"], "html:href"))
+ if tag == "script" and attrs_dict.get("src"):
+ self.links.append((attrs_dict["src"], "html:script-src"))
+ self.script_srcs.append(attrs_dict["src"])
if "src" in attrs_dict:
- self.links.append((attrs_dict["src"], "html:src"))
+ if tag != "script":
+ self.links.append((attrs_dict["src"], "html:src"))
if tag == "form" and attrs_dict.get("action"):
self.links.append((attrs_dict["action"], "html:form-action"))
- if tag == "script" and attrs_dict.get("src"):
- self.script_srcs.append(attrs_dict["src"])
def _web_recon_settings(config):
@@ -124,7 +133,7 @@ def _confidence(source):
return "low"
-def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None):
+def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None, note=None):
if not path or not path.startswith("/") or path.startswith("//") or len(path) > 250:
return
if path not in endpoints and len(endpoints) >= limit:
@@ -141,6 +150,8 @@ def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type
endpoints[path]["status_code"] = status_code
if content_type:
endpoints[path]["content_type"] = content_type
+ if note:
+ endpoints[path]["note"] = note
def _extract_paths_from_text(text):
@@ -175,6 +186,13 @@ def _content_type(response):
return response.headers.get("Content-Type") or response.headers.get("content-type") or ""
+def _title_from_html(text):
+ match = TITLE_RE.search(text or "")
+ if not match:
+ return None
+ return re.sub(r"\s+", " ", match.group(1)).strip()
+
+
def _is_found_probe(response):
if response is None:
return False
@@ -244,8 +262,10 @@ def get_if_allowed(url):
_add_endpoint(endpoints, path, f"response-header:{header_name}", endpoint_limit)
page_response = get_if_allowed(base_url)
+ root_title = None
if page_response is not None and page_response.status_code < 400 and "html" in _content_type(page_response).lower():
html_text = getattr(page_response, "text", "")[:200000]
+ root_title = _title_from_html(html_text)
parser = _parse_html(html_text)
for raw_link, source in parser.links:
path = _normalize_candidate(raw_link, base_url, same_origin_only)
@@ -272,6 +292,14 @@ def get_if_allowed(url):
response = get_if_allowed(urljoin(base_origin, path))
if not _is_found_probe(response):
continue
+ note = None
+ if (
+ response.status_code == 200
+ and root_title
+ and "html" in _content_type(response).lower()
+ and _title_from_html(getattr(response, "text", "")[:200000]) == root_title
+ ):
+ note = "Possible SPA fallback route"
_add_endpoint(
endpoints,
path,
@@ -279,6 +307,7 @@ def get_if_allowed(url):
endpoint_limit,
status_code=response.status_code,
content_type=_content_type(response),
+ note=note,
)
if path == "/robots.txt" and response.status_code < 400:
for disallow_path in _robots_disallow_paths(getattr(response, "text", "")):
diff --git a/activerecon/modules/report_generator.py b/activerecon/modules/report_generator.py
index c225713..7fecf17 100644
--- a/activerecon/modules/report_generator.py
+++ b/activerecon/modules/report_generator.py
@@ -1,5 +1,6 @@
import ipaddress
import logging
+from pathlib import PurePosixPath
from pathlib import Path
@@ -73,6 +74,94 @@ def _write_http_result(f, item):
f.write(f" - `{key}`: {value}\n")
+STATIC_ASSET_EXTENSIONS = {
+ ".css",
+ ".eot",
+ ".gif",
+ ".ico",
+ ".jpeg",
+ ".jpg",
+ ".js",
+ ".map",
+ ".png",
+ ".svg",
+ ".ttf",
+ ".webp",
+ ".woff",
+ ".woff2",
+}
+WELL_KNOWN_REPORT_PATHS = {
+ "/robots.txt",
+ "/sitemap.xml",
+ "/.well-known/security.txt",
+ "/swagger",
+ "/api-docs",
+ "/ftp",
+}
+
+
+def _path_without_query(path):
+ return str(path or "/").split("?", 1)[0].split("#", 1)[0]
+
+
+def _is_api_like_endpoint(path):
+ lower_path = str(path or "").lower()
+ return lower_path == "/api" or lower_path == "/rest" or lower_path.startswith("/api/") or lower_path.startswith("/rest/")
+
+
+def _is_static_asset(path):
+ clean_path = _path_without_query(path).lower()
+ filename = PurePosixPath(clean_path).name
+ return PurePosixPath(clean_path).suffix in STATIC_ASSET_EXTENSIONS or "chunk" in filename
+
+
+def _endpoint_category(endpoint):
+ path = endpoint.get("path", "")
+ lower_path = str(path).lower()
+ if _is_static_asset(path):
+ return "Static Assets"
+ if _is_api_like_endpoint(path):
+ return "API-like Endpoints"
+ if lower_path in WELL_KNOWN_REPORT_PATHS:
+ return "Well-known / Probed Paths"
+ return "Frontend Routes"
+
+
+def _endpoint_line(endpoint):
+ line = (
+ f"- `{endpoint.get('path', '/')}` "
+ f"- **Source:** {endpoint.get('source', 'unknown')} "
+ f"- **Confidence:** {endpoint.get('confidence', 'low')}"
+ )
+ if endpoint.get("status_code") is not None:
+ line += f" - **Status:** {endpoint['status_code']}"
+ if endpoint.get("content_type"):
+ line += f" - **Content-Type:** {endpoint['content_type']}"
+ if endpoint.get("note"):
+ line += f" - **Note:** {endpoint['note']}"
+ return line
+
+
+def _write_endpoint_category(f, title, endpoints):
+ if not endpoints:
+ return
+ f.write(f"#### {title}\n\n")
+ if title == "Static Assets":
+ f.write(f"- **Total Static Assets:** {len(endpoints)}\n")
+ for endpoint in endpoints[:5]:
+ f.write(f"{_endpoint_line(endpoint)}\n")
+ if len(endpoints) > 5:
+ f.write(f"- {len(endpoints) - 5} additional static assets omitted from Markdown.\n")
+ f.write("\n")
+ return
+
+ for endpoint in endpoints[:50]:
+ f.write(f"{_endpoint_line(endpoint)}\n")
+ if len(endpoints) > 50:
+ f.write(f"- Output trimmed. {len(endpoints) - 50} additional endpoints omitted.\n")
+ f.write("\n")
+
+
def _write_endpoint_discovery(f, endpoint_results):
f.write("## Endpoint Discovery\n\n")
if isinstance(endpoint_results, dict) and endpoint_results.get("error"):
@@ -92,20 +181,18 @@ def _write_endpoint_discovery(f, endpoint_results):
if not endpoints:
f.write("- No endpoints discovered.\n\n")
continue
- for endpoint in endpoints[:50]:
- line = (
- f"- `{endpoint.get('path', '/')}` "
- f"- **Source:** {endpoint.get('source', 'unknown')} "
- f"- **Confidence:** {endpoint.get('confidence', 'low')}"
- )
- if endpoint.get("status_code") is not None:
- line += f" - **Status:** {endpoint['status_code']}"
- if endpoint.get("content_type"):
- line += f" - **Content-Type:** {endpoint['content_type']}"
- f.write(f"{line}\n")
- if len(endpoints) > 50:
- f.write(f"- Output trimmed. {len(endpoints) - 50} additional endpoints omitted.\n")
- f.write("\n")
+
+ categorized = {
+ "API-like Endpoints": [],
+ "Frontend Routes": [],
+ "Well-known / Probed Paths": [],
+ "Static Assets": [],
+ }
+ for endpoint in endpoints:
+ categorized[_endpoint_category(endpoint)].append(endpoint)
+
+ for title in ("API-like Endpoints", "Frontend Routes", "Well-known / Probed Paths", "Static Assets"):
+ _write_endpoint_category(f, title, categorized[title])
f.write("---\n\n")
diff --git a/activerecon/modules/risk_analysis.py b/activerecon/modules/risk_analysis.py
index 1b916f7..1442579 100644
--- a/activerecon/modules/risk_analysis.py
+++ b/activerecon/modules/risk_analysis.py
@@ -1,5 +1,6 @@
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
+from urllib.parse import urlparse
SECURITY_HEADERS = {
@@ -98,6 +99,33 @@ def _is_admin_debug_docs_path(path):
return any(token in lower_path for token in ("/admin", "/debug", "/swagger", "/api-docs"))
+def _url_origin(url):
+ parsed = urlparse(str(url or ""))
+ if not parsed.scheme or not parsed.netloc:
+ return ""
+ return f"{parsed.scheme}://{parsed.netloc}"
+
+
+def _endpoint_evidence(base_url, path):
+ if not base_url:
+ return str(path or "")
+ if not path:
+ return str(base_url)
+ return f"{str(base_url).rstrip('/')}/{str(path).lstrip('/')}"
+
+
+def _http_header_path_evidence(item, path):
+ origin = _url_origin(item.get("final_url") or item.get("url"))
+ return _endpoint_evidence(origin, path) if origin else str(path or "")
+
+
+def _header_name_from_source(source):
+ prefix, separator, header_name = str(source or "").partition(":")
+ if prefix == "response-header" and separator and header_name:
+ return header_name
+ return "response header"
+
+
def generate_attention_findings(results, now=None):
findings = []
seen_endpoint_signals = set()
@@ -142,13 +170,11 @@ def add_endpoint_signal(severity, category, message, evidence):
path = _first_path_like_header_value(value)
if path:
- findings.append(
- _finding(
- "info",
- "endpoint",
- f"Interesting path found in response header {header_name}",
- path,
- )
+ add_endpoint_signal(
+ "info",
+ "endpoint",
+ f"Interesting path found in response header {header_name}",
+ _http_header_path_evidence(item, path),
)
if item.get("redirect_chain"):
@@ -157,23 +183,29 @@ def add_endpoint_signal(severity, category, message, evidence):
for base_url, endpoint in _endpoint_items(results):
path = endpoint.get("path", "")
source = endpoint.get("source", "")
- evidence = f"{base_url}{path}" if base_url else path
+ evidence = _endpoint_evidence(base_url, path)
if path == "/robots.txt" and endpoint.get("status_code") is not None:
- add_endpoint_signal("info", "endpoint", "robots.txt found", evidence)
+ add_endpoint_signal("info", "endpoint", "robots.txt found; follow-up recommended", evidence)
if source == "robots.txt":
- add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths", evidence)
+ add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths; follow-up recommended", evidence)
if source.startswith("response-header"):
- add_endpoint_signal("info", "endpoint", "Interesting endpoint from response header", evidence)
+ header_name = _header_name_from_source(source)
+ add_endpoint_signal(
+ "info",
+ "endpoint",
+ f"Interesting path found in response header {header_name}",
+ evidence,
+ )
if _is_api_like_path(path):
if source.startswith("javascript"):
- add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like paths", evidence)
+ add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like path candidate", evidence)
else:
- add_endpoint_signal("info", "endpoint", "API-like endpoint discovered", evidence)
+ add_endpoint_signal("info", "endpoint", "API-like endpoint discovered; follow-up recommended", evidence)
if _is_admin_debug_docs_path(path):
- add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered", evidence)
+ add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered; follow-up recommended", evidence)
if str(path).lower() == "/ftp":
- add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered", evidence)
+ add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered; follow-up recommended", evidence)
now = now or datetime.now(timezone.utc)
comparable_now = now.replace(tzinfo=None)
diff --git a/tests/test_endpoint_discovery.py b/tests/test_endpoint_discovery.py
index 9515ddf..e265bc0 100644
--- a/tests/test_endpoint_discovery.py
+++ b/tests/test_endpoint_discovery.py
@@ -18,7 +18,10 @@ def fake_get(url, timeout):
return Response(
headers={"Content-Type": "text/html"},
text="""
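+ <title>Juice Shop</title>
<a href="/login">login</a>
+ <link rel="stylesheet" href="/style.css">
+ <link rel="icon" href="/favicon.ico">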
@@ -33,7 +36,12 @@ def fake_get(url, timeout):
if url == "http://example.com:3000/api":
return Response(headers={"Content-Type": "application/json"}, text="{}", url=url)
if url == "http://example.com:3000/admin":
- return Response(status_code=403, headers={"Content-Type": "text/html"}, text="", url=url)
+ return Response(
+ status_code=200,
+ headers={"Content-Type": "text/html"},
+ text="<title>Juice Shop</title>",
+ url=url,
+ )
return Response(status_code=404, headers={"Content-Type": "text/plain"}, text="", url=url)
monkeypatch.setattr(endpoint_discovery.requests, "get", fake_get)
@@ -61,6 +69,9 @@ def fake_get(url, timeout):
assert results[0]["base_url"] == "http://example.com:3000"
assert endpoints["/#/jobs"]["source"] == "response-header:X-Recruiting"
assert endpoints["/login"]["source"] == "html:href"
+ assert endpoints["/style.css"]["source"] == "html:stylesheet"
+ assert endpoints["/favicon.ico"]["source"] == "html:icon"
+ assert endpoints["/app.js"]["source"] == "html:script-src"
assert endpoints["/submit"]["source"] == "html:form-action"
assert endpoints["/rest/products"]["source"] == "html-string"
assert endpoints["/api/orders"]["source"] == "javascript"
@@ -68,7 +79,8 @@ def fake_get(url, timeout):
assert endpoints["/robots.txt"]["content_type"] == "text/plain"
assert endpoints["/hidden"]["source"] == "robots.txt"
assert endpoints["/api"]["status_code"] == 200
- assert endpoints["/admin"]["status_code"] == 403
+ assert endpoints["/admin"]["status_code"] == 200
+ assert endpoints["/admin"]["note"] == "Possible SPA fallback route"
assert not any("cdn.example.net" in url for url, timeout in calls)
diff --git a/tests/test_report_generator.py b/tests/test_report_generator.py
index 836463e..4bad755 100644
--- a/tests/test_report_generator.py
+++ b/tests/test_report_generator.py
@@ -52,13 +52,36 @@ def test_generate_report_writes_nested_results(tmp_path):
},
"Endpoint Discovery": [{
"base_url": "http://example.com",
- "endpoints": [{
- "path": "/api",
- "source": "well-known",
- "confidence": "medium",
- "status_code": 200,
- "content_type": "application/json",
- }],
+ "endpoints": [
+ {
+ "path": "/api",
+ "source": "well-known",
+ "confidence": "medium",
+ "status_code": 200,
+ "content_type": "application/json",
+ },
+ {
+ "path": "/login",
+ "source": "html:href",
+ "confidence": "medium",
+ "status_code": 200,
+ "content_type": "text/html",
+ "note": "Possible SPA fallback route",
+ },
+ {
+ "path": "/robots.txt",
+ "source": "well-known",
+ "confidence": "medium",
+ "status_code": 200,
+ "content_type": "text/plain",
+ },
+ {"path": "/app.js", "source": "html:script-src", "confidence": "medium"},
+ {"path": "/style.css", "source": "html:stylesheet", "confidence": "medium"},
+ {"path": "/favicon.ico", "source": "html:icon", "confidence": "medium"},
+ {"path": "/assets/logo.png", "source": "html:src", "confidence": "medium"},
+ {"path": "/assets/chunk-1.js", "source": "html:script-src", "confidence": "medium"},
+ {"path": "/assets/extra.css", "source": "html:stylesheet", "confidence": "medium"},
+ ],
}],
"Attention": [{
"severity": "low",
@@ -99,7 +122,19 @@ def test_generate_report_writes_nested_results(tmp_path):
assert " - `Server`: test" in content
assert "## Endpoint Discovery" in content
assert "### http://example.com" in content
+ assert "#### API-like Endpoints" in content
+ assert "#### Frontend Routes" in content
+ assert "#### Well-known / Probed Paths" in content
+ assert "#### Static Assets" in content
assert "`/api` - **Source:** well-known - **Confidence:** medium - **Status:** 200 - **Content-Type:** application/json" in content
+ assert "`/login` - **Source:** html:href - **Confidence:** medium - **Status:** 200 - **Content-Type:** text/html - **Note:** Possible SPA fallback route" in content
+ assert "`/robots.txt` - **Source:** well-known - **Confidence:** medium - **Status:** 200 - **Content-Type:** text/plain" in content
+ assert "- **Total Static Assets:** 6" in content
+ assert "`/app.js` - **Source:** html:script-src" in content
+ assert "`/style.css` - **Source:** html:stylesheet" in content
+ assert "`/favicon.ico` - **Source:** html:icon" in content
+ assert "/assets/extra.css" not in content
+ assert "1 additional static assets omitted from Markdown." in content
assert "## DNS Analysis" in content
assert "## TLS Analysis" in content
assert "TLSv1.3" in content
diff --git a/tests/test_risk_analysis.py b/tests/test_risk_analysis.py
index 4cd4839..230435f 100644
--- a/tests/test_risk_analysis.py
+++ b/tests/test_risk_analysis.py
@@ -124,13 +124,40 @@ def test_generate_attention_findings_reports_cors_and_header_paths_as_info():
assert cors_finding["evidence"] == "http://example.com"
assert endpoint_finding["severity"] == "info"
assert endpoint_finding["message"] == "Interesting path found in response header X-Recruiting"
- assert endpoint_finding["evidence"] == "/#/jobs"
+ assert endpoint_finding["evidence"] == "http://example.com/#/jobs"
assert technology_finding["severity"] == "info"
assert technology_finding["message"] == "X-Powered-By header exposed"
assert technology_finding["evidence"] == "Express - http://example.com"
assert all(item.get("evidence") != "no path here" for item in findings)
+def test_generate_attention_findings_deduplicates_response_header_endpoint_signals():
+ results = {
+ "HTTP Analysis": [{
+ "url": "http://example.com",
+ "headers": {"X-Recruiting": "/#/jobs"},
+ }],
+ "Endpoint Discovery": [{
+ "base_url": "http://example.com",
+ "endpoints": [
+ {"path": "/#/jobs", "source": "response-header:X-Recruiting"},
+ ],
+ }],
+ }
+
+ findings = generate_attention_findings(results)
+ messages = [item["message"] for item in findings]
+ header_findings = [
+ item
+ for item in findings
+ if item["message"] == "Interesting path found in response header X-Recruiting"
+ ]
+
+ assert len(header_findings) == 1
+ assert header_findings[0]["evidence"] == "http://example.com/#/jobs"
+ assert "Interesting endpoint from response header" not in messages
+
+
def test_generate_attention_findings_reports_endpoint_discovery_signals():
results = {
"Endpoint Discovery": [{
@@ -150,13 +177,13 @@ def test_generate_attention_findings_reports_endpoint_discovery_signals():
findings = generate_attention_findings(results)
messages = [item["message"] for item in findings]
- assert "API-like endpoint discovered" in messages
- assert "robots.txt found" in messages
- assert "robots.txt contains Disallow paths" in messages
- assert "Interesting endpoint from response header" in messages
- assert "JavaScript exposes API-like paths" in messages
- assert "Possible admin/debug/docs route discovered" in messages
- assert "/ftp endpoint discovered" in messages
+ assert "API-like endpoint discovered; follow-up recommended" in messages
+ assert "robots.txt found; follow-up recommended" in messages
+ assert "robots.txt contains Disallow paths; follow-up recommended" in messages
+ assert "Interesting path found in response header X-Recruiting" in messages
+ assert "JavaScript exposes API-like path candidate" in messages
+ assert "Possible admin/debug/docs route discovered; follow-up recommended" in messages
+ assert "/ftp endpoint discovered; follow-up recommended" in messages
def test_generate_attention_findings_reports_expired_tls_certificates():