From ae2fa2ecd2a09fc4b7494f472087b7244690da79 Mon Sep 17 00:00:00 2001 From: CamiloCod3 Date: Wed, 17 Jun 2026 15:18:43 +0200 Subject: [PATCH] Improve endpoint discovery report readability --- activerecon/modules/endpoint_discovery.py | 39 +++++++- activerecon/modules/report_generator.py | 115 +++++++++++++++++++--- activerecon/modules/risk_analysis.py | 62 +++++++++--- tests/test_endpoint_discovery.py | 16 ++- tests/test_report_generator.py | 49 +++++++-- tests/test_risk_analysis.py | 43 ++++++-- 6 files changed, 273 insertions(+), 51 deletions(-) diff --git a/activerecon/modules/endpoint_discovery.py b/activerecon/modules/endpoint_discovery.py index d519fc8..9b2eb00 100644 --- a/activerecon/modules/endpoint_discovery.py +++ b/activerecon/modules/endpoint_discovery.py @@ -22,6 +22,7 @@ DEFAULT_ENDPOINT_LIMIT = 50 DEFAULT_HTTP_TIMEOUT = 5 PATH_STRING_RE = re.compile(r"""["'`](/[A-Za-z0-9._~:/?#\[\]@!$&()*+,;=%-]{1,200})["'`]""") +TITLE_RE = re.compile(r"]*>(.*?)", flags=re.IGNORECASE | re.DOTALL) class EndpointHTMLParser(HTMLParser): @@ -34,13 +35,21 @@ def handle_starttag(self, tag, attrs): attrs_dict = dict(attrs) if "href" in attrs_dict: - self.links.append((attrs_dict["href"], "html:href")) + rel = str(attrs_dict.get("rel", "")).lower() + if tag == "link" and "stylesheet" in rel: + self.links.append((attrs_dict["href"], "html:stylesheet")) + elif tag == "link" and "icon" in rel: + self.links.append((attrs_dict["href"], "html:icon")) + else: + self.links.append((attrs_dict["href"], "html:href")) + if tag == "script" and attrs_dict.get("src"): + self.links.append((attrs_dict["src"], "html:script-src")) + self.script_srcs.append(attrs_dict["src"]) if "src" in attrs_dict: - self.links.append((attrs_dict["src"], "html:src")) + if tag != "script": + self.links.append((attrs_dict["src"], "html:src")) if tag == "form" and attrs_dict.get("action"): self.links.append((attrs_dict["action"], "html:form-action")) - if tag == "script" and attrs_dict.get("src"): - self.script_srcs.append(attrs_dict["src"]) def _web_recon_settings(config): @@ -124,7 +133,7 @@ def _confidence(source): return "low" -def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None): +def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None, note=None): if not path or not path.startswith("/") or path.startswith("//") or len(path) > 250: return if path not in endpoints and len(endpoints) >= limit: @@ -141,6 +150,8 @@ def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type endpoints[path]["status_code"] = status_code if content_type: endpoints[path]["content_type"] = content_type + if note: + endpoints[path]["note"] = note def _extract_paths_from_text(text): @@ -175,6 +186,13 @@ def _content_type(response): return response.headers.get("Content-Type") or response.headers.get("content-type") or "" +def _title_from_html(text): + match = TITLE_RE.search(text or "") + if not match: + return None + return re.sub(r"\s+", " ", match.group(1)).strip() + + def _is_found_probe(response): if response is None: return False @@ -244,8 +262,10 @@ def get_if_allowed(url): _add_endpoint(endpoints, path, f"response-header:{header_name}", endpoint_limit) page_response = get_if_allowed(base_url) + root_title = None if page_response is not None and page_response.status_code < 400 and "html" in _content_type(page_response).lower(): html_text = getattr(page_response, "text", "")[:200000] + root_title = _title_from_html(html_text) parser = _parse_html(html_text) for raw_link, source in parser.links: path = _normalize_candidate(raw_link, base_url, same_origin_only) @@ -272,6 +292,14 @@ def get_if_allowed(url): response = get_if_allowed(urljoin(base_origin, path)) if not _is_found_probe(response): continue + note = None + if ( + response.status_code == 200 + and root_title + and "html" in _content_type(response).lower() + and _title_from_html(getattr(response, "text", "")[:200000]) == root_title + ): + note = "Possible SPA fallback route" _add_endpoint( endpoints, path, @@ -279,6 +307,7 @@ def get_if_allowed(url): endpoint_limit, status_code=response.status_code, content_type=_content_type(response), + note=note, ) if path == "/robots.txt" and response.status_code < 400: for disallow_path in _robots_disallow_paths(getattr(response, "text", "")): diff --git a/activerecon/modules/report_generator.py b/activerecon/modules/report_generator.py index c225713..7fecf17 100644 --- a/activerecon/modules/report_generator.py +++ b/activerecon/modules/report_generator.py @@ -1,5 +1,6 @@ import ipaddress import logging +from pathlib import PurePosixPath from pathlib import Path @@ -73,6 +74,94 @@ def _write_http_result(f, item): f.write(f" - `{key}`: {value}\n") +STATIC_ASSET_EXTENSIONS = { + ".css", + ".eot", + ".gif", + ".ico", + ".jpeg", + ".jpg", + ".js", + ".map", + ".png", + ".svg", + ".ttf", + ".webp", + ".woff", + ".woff2", +} +WELL_KNOWN_REPORT_PATHS = { + "/robots.txt", + "/sitemap.xml", + "/.well-known/security.txt", + "/swagger", + "/api-docs", + "/ftp", +} + + +def _path_without_query(path): + return str(path or "/").split("?", 1)[0].split("#", 1)[0] + + +def _is_api_like_endpoint(path): + lower_path = str(path or "").lower() + return lower_path == "/api" or lower_path == "/rest" or lower_path.startswith("/api/") or lower_path.startswith("/rest/") + + +def _is_static_asset(path): + clean_path = _path_without_query(path).lower() + filename = PurePosixPath(clean_path).name + return PurePosixPath(clean_path).suffix in STATIC_ASSET_EXTENSIONS or "chunk" in filename + + +def _endpoint_category(endpoint): + path = endpoint.get("path", "") + lower_path = str(path).lower() + if _is_static_asset(path): + return "Static Assets" + if _is_api_like_endpoint(path): + return "API-like Endpoints" + if lower_path in WELL_KNOWN_REPORT_PATHS: + return "Well-known / Probed Paths" + return "Frontend Routes" + + +def _endpoint_line(endpoint): + line = ( + f"- `{endpoint.get('path', '/')}` " + f"- **Source:** {endpoint.get('source', 'unknown')} " + f"- **Confidence:** {endpoint.get('confidence', 'low')}" + ) + if endpoint.get("status_code") is not None: + line += f" - **Status:** {endpoint['status_code']}" + if endpoint.get("content_type"): + line += f" - **Content-Type:** {endpoint['content_type']}" + if endpoint.get("note"): + line += f" - **Note:** {endpoint['note']}" + return line + + +def _write_endpoint_category(f, title, endpoints): + if not endpoints: + return + f.write(f"#### {title}\n\n") + if title == "Static Assets": + f.write(f"- **Total Static Assets:** {len(endpoints)}\n") + for endpoint in endpoints[:5]: + f.write(f"{_endpoint_line(endpoint)}\n") + if len(endpoints) > 5: + f.write(f"- {len(endpoints) - 5} additional static assets omitted from Markdown.\n") + f.write("\n") + return + + for endpoint in endpoints[:50]: + f.write(f"{_endpoint_line(endpoint)}\n") + if len(endpoints) > 50: + f.write(f"- Output trimmed. {len(endpoints) - 50} additional endpoints omitted.\n") + f.write("\n") + + def _write_endpoint_discovery(f, endpoint_results): f.write("## Endpoint Discovery\n\n") if isinstance(endpoint_results, dict) and endpoint_results.get("error"): @@ -92,20 +181,18 @@ def _write_endpoint_discovery(f, endpoint_results): if not endpoints: f.write("- No endpoints discovered.\n\n") continue - for endpoint in endpoints[:50]: - line = ( - f"- `{endpoint.get('path', '/')}` " - f"- **Source:** {endpoint.get('source', 'unknown')} " - f"- **Confidence:** {endpoint.get('confidence', 'low')}" - ) - if endpoint.get("status_code") is not None: - line += f" - **Status:** {endpoint['status_code']}" - if endpoint.get("content_type"): - line += f" - **Content-Type:** {endpoint['content_type']}" - f.write(f"{line}\n") - if len(endpoints) > 50: - f.write(f"- Output trimmed. {len(endpoints) - 50} additional endpoints omitted.\n") - f.write("\n") + + categorized = { + "API-like Endpoints": [], + "Frontend Routes": [], + "Well-known / Probed Paths": [], + "Static Assets": [], + } + for endpoint in endpoints: + categorized[_endpoint_category(endpoint)].append(endpoint) + + for title in ("API-like Endpoints", "Frontend Routes", "Well-known / Probed Paths", "Static Assets"): + _write_endpoint_category(f, title, categorized[title]) f.write("---\n\n") diff --git a/activerecon/modules/risk_analysis.py b/activerecon/modules/risk_analysis.py index 1b916f7..1442579 100644 --- a/activerecon/modules/risk_analysis.py +++ b/activerecon/modules/risk_analysis.py @@ -1,5 +1,6 @@ from datetime import datetime, timezone from email.utils import parsedate_to_datetime +from urllib.parse import urlparse SECURITY_HEADERS = { @@ -98,6 +99,33 @@ def _is_admin_debug_docs_path(path): return any(token in lower_path for token in ("/admin", "/debug", "/swagger", "/api-docs")) +def _url_origin(url): + parsed = urlparse(str(url or "")) + if not parsed.scheme or not parsed.netloc: + return "" + return f"{parsed.scheme}://{parsed.netloc}" + + +def _endpoint_evidence(base_url, path): + if not base_url: + return str(path or "") + if not path: + return str(base_url) + return f"{str(base_url).rstrip('/')}/{str(path).lstrip('/')}" + + +def _http_header_path_evidence(item, path): + origin = _url_origin(item.get("final_url") or item.get("url")) + return _endpoint_evidence(origin, path) if origin else str(path or "") + + +def _header_name_from_source(source): + prefix, separator, header_name = str(source or "").partition(":") + if prefix == "response-header" and separator and header_name: + return header_name + return "response header" + + def generate_attention_findings(results, now=None): findings = [] seen_endpoint_signals = set() @@ -142,13 +170,11 @@ def add_endpoint_signal(severity, category, message, evidence): path = _first_path_like_header_value(value) if path: - findings.append( - _finding( - "info", - "endpoint", - f"Interesting path found in response header {header_name}", - path, - ) + add_endpoint_signal( + "info", + "endpoint", + f"Interesting path found in response header {header_name}", + _http_header_path_evidence(item, path), ) if item.get("redirect_chain"): @@ -157,23 +183,29 @@ def add_endpoint_signal(severity, category, message, evidence): for base_url, endpoint in _endpoint_items(results): path = endpoint.get("path", "") source = endpoint.get("source", "") - evidence = f"{base_url}{path}" if base_url else path + evidence = _endpoint_evidence(base_url, path) if path == "/robots.txt" and endpoint.get("status_code") is not None: - add_endpoint_signal("info", "endpoint", "robots.txt found", evidence) + add_endpoint_signal("info", "endpoint", "robots.txt found; follow-up recommended", evidence) if source == "robots.txt": - add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths", evidence) + add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths; follow-up recommended", evidence) if source.startswith("response-header"): - add_endpoint_signal("info", "endpoint", "Interesting endpoint from response header", evidence) + header_name = _header_name_from_source(source) + add_endpoint_signal( + "info", + "endpoint", + f"Interesting path found in response header {header_name}", + evidence, + ) if _is_api_like_path(path): if source.startswith("javascript"): - add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like paths", evidence) + add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like path candidate", evidence) else: - add_endpoint_signal("info", "endpoint", "API-like endpoint discovered", evidence) + add_endpoint_signal("info", "endpoint", "API-like endpoint discovered; follow-up recommended", evidence) if _is_admin_debug_docs_path(path): - add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered", evidence) + add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered; follow-up recommended", evidence) if str(path).lower() == "/ftp": - add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered", evidence) + add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered; follow-up recommended", evidence) now = now or datetime.now(timezone.utc) comparable_now = now.replace(tzinfo=None) diff --git a/tests/test_endpoint_discovery.py b/tests/test_endpoint_discovery.py index 9515ddf..e265bc0 100644 --- a/tests/test_endpoint_discovery.py +++ b/tests/test_endpoint_discovery.py @@ -18,7 +18,10 @@ def fake_get(url, timeout): return Response( headers={"Content-Type": "text/html"}, text=""" + Juice Shop login + +
@@ -33,7 +36,12 @@ def fake_get(url, timeout): if url == "http://example.com:3000/api": return Response(headers={"Content-Type": "application/json"}, text="{}", url=url) if url == "http://example.com:3000/admin": - return Response(status_code=403, headers={"Content-Type": "text/html"}, text="", url=url) + return Response( + status_code=200, + headers={"Content-Type": "text/html"}, + text="Juice Shop
", + url=url, + ) return Response(status_code=404, headers={"Content-Type": "text/plain"}, text="", url=url) monkeypatch.setattr(endpoint_discovery.requests, "get", fake_get) @@ -61,6 +69,9 @@ def fake_get(url, timeout): assert results[0]["base_url"] == "http://example.com:3000" assert endpoints["/#/jobs"]["source"] == "response-header:X-Recruiting" assert endpoints["/login"]["source"] == "html:href" + assert endpoints["/style.css"]["source"] == "html:stylesheet" + assert endpoints["/favicon.ico"]["source"] == "html:icon" + assert endpoints["/app.js"]["source"] == "html:script-src" assert endpoints["/submit"]["source"] == "html:form-action" assert endpoints["/rest/products"]["source"] == "html-string" assert endpoints["/api/orders"]["source"] == "javascript" @@ -68,7 +79,8 @@ def fake_get(url, timeout): assert endpoints["/robots.txt"]["content_type"] == "text/plain" assert endpoints["/hidden"]["source"] == "robots.txt" assert endpoints["/api"]["status_code"] == 200 - assert endpoints["/admin"]["status_code"] == 403 + assert endpoints["/admin"]["status_code"] == 200 + assert endpoints["/admin"]["note"] == "Possible SPA fallback route" assert not any("cdn.example.net" in url for url, timeout in calls) diff --git a/tests/test_report_generator.py b/tests/test_report_generator.py index 836463e..4bad755 100644 --- a/tests/test_report_generator.py +++ b/tests/test_report_generator.py @@ -52,13 +52,36 @@ def test_generate_report_writes_nested_results(tmp_path): }, "Endpoint Discovery": [{ "base_url": "http://example.com", - "endpoints": [{ - "path": "/api", - "source": "well-known", - "confidence": "medium", - "status_code": 200, - "content_type": "application/json", - }], + "endpoints": [ + { + "path": "/api", + "source": "well-known", + "confidence": "medium", + "status_code": 200, + "content_type": "application/json", + }, + { + "path": "/login", + "source": "html:href", + "confidence": "medium", + "status_code": 200, + "content_type": "text/html", + "note": "Possible SPA fallback route", + }, + { + "path": "/robots.txt", + "source": "well-known", + "confidence": "medium", + "status_code": 200, + "content_type": "text/plain", + }, + {"path": "/app.js", "source": "html:script-src", "confidence": "medium"}, + {"path": "/style.css", "source": "html:stylesheet", "confidence": "medium"}, + {"path": "/favicon.ico", "source": "html:icon", "confidence": "medium"}, + {"path": "/assets/logo.png", "source": "html:src", "confidence": "medium"}, + {"path": "/assets/chunk-1.js", "source": "html:script-src", "confidence": "medium"}, + {"path": "/assets/extra.css", "source": "html:stylesheet", "confidence": "medium"}, + ], }], "Attention": [{ "severity": "low", @@ -99,7 +122,19 @@ def test_generate_report_writes_nested_results(tmp_path): assert " - `Server`: test" in content assert "## Endpoint Discovery" in content assert "### http://example.com" in content + assert "#### API-like Endpoints" in content + assert "#### Frontend Routes" in content + assert "#### Well-known / Probed Paths" in content + assert "#### Static Assets" in content assert "`/api` - **Source:** well-known - **Confidence:** medium - **Status:** 200 - **Content-Type:** application/json" in content + assert "`/login` - **Source:** html:href - **Confidence:** medium - **Status:** 200 - **Content-Type:** text/html - **Note:** Possible SPA fallback route" in content + assert "`/robots.txt` - **Source:** well-known - **Confidence:** medium - **Status:** 200 - **Content-Type:** text/plain" in content + assert "- **Total Static Assets:** 6" in content + assert "`/app.js` - **Source:** html:script-src" in content + assert "`/style.css` - **Source:** html:stylesheet" in content + assert "`/favicon.ico` - **Source:** html:icon" in content + assert "/assets/extra.css" not in content + assert "1 additional static assets omitted from Markdown." in content assert "## DNS Analysis" in content assert "## TLS Analysis" in content assert "TLSv1.3" in content diff --git a/tests/test_risk_analysis.py b/tests/test_risk_analysis.py index 4cd4839..230435f 100644 --- a/tests/test_risk_analysis.py +++ b/tests/test_risk_analysis.py @@ -124,13 +124,40 @@ def test_generate_attention_findings_reports_cors_and_header_paths_as_info(): assert cors_finding["evidence"] == "http://example.com" assert endpoint_finding["severity"] == "info" assert endpoint_finding["message"] == "Interesting path found in response header X-Recruiting" - assert endpoint_finding["evidence"] == "/#/jobs" + assert endpoint_finding["evidence"] == "http://example.com/#/jobs" assert technology_finding["severity"] == "info" assert technology_finding["message"] == "X-Powered-By header exposed" assert technology_finding["evidence"] == "Express - http://example.com" assert all(item.get("evidence") != "no path here" for item in findings) +def test_generate_attention_findings_deduplicates_response_header_endpoint_signals(): + results = { + "HTTP Analysis": [{ + "url": "http://example.com", + "headers": {"X-Recruiting": "/#/jobs"}, + }], + "Endpoint Discovery": [{ + "base_url": "http://example.com", + "endpoints": [ + {"path": "/#/jobs", "source": "response-header:X-Recruiting"}, + ], + }], + } + + findings = generate_attention_findings(results) + messages = [item["message"] for item in findings] + header_findings = [ + item + for item in findings + if item["message"] == "Interesting path found in response header X-Recruiting" + ] + + assert len(header_findings) == 1 + assert header_findings[0]["evidence"] == "http://example.com/#/jobs" + assert "Interesting endpoint from response header" not in messages + + def test_generate_attention_findings_reports_endpoint_discovery_signals(): results = { "Endpoint Discovery": [{ @@ -150,13 +177,13 @@ def test_generate_attention_findings_reports_endpoint_discovery_signals(): findings = generate_attention_findings(results) messages = [item["message"] for item in findings] - assert "API-like endpoint discovered" in messages - assert "robots.txt found" in messages - assert "robots.txt contains Disallow paths" in messages - assert "Interesting endpoint from response header" in messages - assert "JavaScript exposes API-like paths" in messages - assert "Possible admin/debug/docs route discovered" in messages - assert "/ftp endpoint discovered" in messages + assert "API-like endpoint discovered; follow-up recommended" in messages + assert "robots.txt found; follow-up recommended" in messages + assert "robots.txt contains Disallow paths; follow-up recommended" in messages + assert "Interesting path found in response header X-Recruiting" in messages + assert "JavaScript exposes API-like path candidate" in messages + assert "Possible admin/debug/docs route discovered; follow-up recommended" in messages + assert "/ftp endpoint discovered; follow-up recommended" in messages def test_generate_attention_findings_reports_expired_tls_certificates():