From ae2fa2ecd2a09fc4b7494f472087b7244690da79 Mon Sep 17 00:00:00 2001
From: CamiloCod3 <deleon88198@gmail.com>
Date: Wed, 17 Jun 2026 15:18:43 +0200
Subject: [PATCH] Improve endpoint discovery report readability

---
 activerecon/modules/endpoint_discovery.py |  39 +++++++-
 activerecon/modules/report_generator.py   | 115 +++++++++++++++++++---
 activerecon/modules/risk_analysis.py      |  62 +++++++++---
 tests/test_endpoint_discovery.py          |  16 ++-
 tests/test_report_generator.py            |  49 +++++++--
 tests/test_risk_analysis.py               |  43 ++++++--
 6 files changed, 273 insertions(+), 51 deletions(-)
diff --git a/activerecon/modules/endpoint_discovery.py b/activerecon/modules/endpoint_discovery.py
index d519fc8..9b2eb00 100644
--- a/activerecon/modules/endpoint_discovery.py
+++ b/activerecon/modules/endpoint_discovery.py
@@ -22,6 +22,7 @@
 DEFAULT_ENDPOINT_LIMIT = 50
 DEFAULT_HTTP_TIMEOUT = 5
 PATH_STRING_RE = re.compile(r"""["'`](/[A-Za-z0-9._~:/?#\[\]@!$&()*+,;=%-]{1,200})["'`]""")
+TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", flags=re.IGNORECASE | re.DOTALL)
 
 
 class EndpointHTMLParser(HTMLParser):
@@ -34,13 +35,21 @@ def handle_starttag(self, tag, attrs):
         attrs_dict = dict(attrs)
 
         if "href" in attrs_dict:
-            self.links.append((attrs_dict["href"], "html:href"))
+            rel = str(attrs_dict.get("rel", "")).lower()
+            if tag == "link" and "stylesheet" in rel:
+                self.links.append((attrs_dict["href"], "html:stylesheet"))
+            elif tag == "link" and "icon" in rel:
+                self.links.append((attrs_dict["href"], "html:icon"))
+            else:
+                self.links.append((attrs_dict["href"], "html:href"))
+        if tag == "script" and attrs_dict.get("src"):
+            self.links.append((attrs_dict["src"], "html:script-src"))
+            self.script_srcs.append(attrs_dict["src"])
         if "src" in attrs_dict:
-            self.links.append((attrs_dict["src"], "html:src"))
+            if tag != "script":
+                self.links.append((attrs_dict["src"], "html:src"))
         if tag == "form" and attrs_dict.get("action"):
             self.links.append((attrs_dict["action"], "html:form-action"))
-        if tag == "script" and attrs_dict.get("src"):
-            self.script_srcs.append(attrs_dict["src"])
 
 
 def _web_recon_settings(config):
@@ -124,7 +133,7 @@ def _confidence(source):
     return "low"
 
 
-def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None):
+def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None, note=None):
     if not path or not path.startswith("/") or path.startswith("//") or len(path) > 250:
         return
     if path not in endpoints and len(endpoints) >= limit:
@@ -141,6 +150,8 @@ def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type
         endpoints[path]["status_code"] = status_code
     if content_type:
         endpoints[path]["content_type"] = content_type
+    if note:
+        endpoints[path]["note"] = note
 
 
 def _extract_paths_from_text(text):
@@ -175,6 +186,13 @@ def _content_type(response):
     return response.headers.get("Content-Type") or response.headers.get("content-type") or ""
 
 
+def _title_from_html(text):
+    """Return the first <title> text with whitespace runs collapsed, or None when no title is present."""
+    found = TITLE_RE.search(text or "")
+    # Collapse newlines/indentation inside the tag so titles from different pages compare equal.
+    return re.sub(r"\s+", " ", found.group(1)).strip() if found else None
+
+
 def _is_found_probe(response):
     if response is None:
         return False
@@ -244,8 +262,10 @@ def get_if_allowed(url):
                 _add_endpoint(endpoints, path, f"response-header:{header_name}", endpoint_limit)
 
         page_response = get_if_allowed(base_url)
+        root_title = None
         if page_response is not None and page_response.status_code < 400 and "html" in _content_type(page_response).lower():
             html_text = getattr(page_response, "text", "")[:200000]
+            root_title = _title_from_html(html_text)
             parser = _parse_html(html_text)
             for raw_link, source in parser.links:
                 path = _normalize_candidate(raw_link, base_url, same_origin_only)
@@ -272,6 +292,14 @@ def get_if_allowed(url):
             response = get_if_allowed(urljoin(base_origin, path))
             if not _is_found_probe(response):
                 continue
+            note = None
+            if (
+                response.status_code == 200
+                and root_title
+                and "html" in _content_type(response).lower()
+                and _title_from_html(getattr(response, "text", "")[:200000]) == root_title
+            ):
+                note = "Possible SPA fallback route"
             _add_endpoint(
                 endpoints,
                 path,
@@ -279,6 +307,7 @@ def get_if_allowed(url):
                 endpoint_limit,
                 status_code=response.status_code,
                 content_type=_content_type(response),
+                note=note,
             )
             if path == "/robots.txt" and response.status_code < 400:
                 for disallow_path in _robots_disallow_paths(getattr(response, "text", "")):
diff --git a/activerecon/modules/report_generator.py b/activerecon/modules/report_generator.py
index c225713..7fecf17 100644
--- a/activerecon/modules/report_generator.py
+++ b/activerecon/modules/report_generator.py
@@ -1,5 +1,6 @@
 import ipaddress
 import logging
+from pathlib import PurePosixPath
 from pathlib import Path
 
 
@@ -73,6 +74,94 @@ def _write_http_result(f, item):
             f.write(f"  - `{key}`: {value}\n")
 
 
+STATIC_ASSET_EXTENSIONS = {  # lowercase dotted suffixes that _is_static_asset treats as static files
+    ".css",
+    ".eot",
+    ".gif",
+    ".ico",
+    ".jpeg",
+    ".jpg",
+    ".js",
+    ".map",
+    ".png",
+    ".svg",
+    ".ttf",
+    ".webp",
+    ".woff",
+    ".woff2",
+}
+WELL_KNOWN_REPORT_PATHS = {  # exact lowercase paths that _endpoint_category lists as "Well-known / Probed Paths"
+    "/robots.txt",
+    "/sitemap.xml",
+    "/.well-known/security.txt",
+    "/swagger",
+    "/api-docs",
+    "/ftp",
+}
+
+
+def _path_without_query(path):  # drop ?query and #fragment; a falsy path becomes "/"
+    return str(path or "/").partition("?")[0].partition("#")[0]
+
+
+def _is_api_like_endpoint(path):
+    candidate = str(path or "").lower()  # case-insensitive match on /api, /rest and anything beneath them
+    return candidate in ("/api", "/rest") or candidate.startswith(("/api/", "/rest/"))
+
+
+def _is_static_asset(path):
+    """Return True for asset-extension files, or "chunk"-named files outside /api and /rest (keeps /api/chunks API-like)."""
+    clean_path = _path_without_query(path).lower()
+    return PurePosixPath(clean_path).suffix in STATIC_ASSET_EXTENSIONS or ("chunk" in PurePosixPath(clean_path).name and not _is_api_like_endpoint(clean_path))
+
+
+def _endpoint_category(endpoint):
+    """Return the report heading an endpoint dict is listed under."""
+    path = endpoint.get("path", "")
+    # Order matters: the static-asset check runs before the API prefix check.
+    if _is_static_asset(path):
+        return "Static Assets"
+    if _is_api_like_endpoint(path):
+        return "API-like Endpoints"
+    is_well_known = str(path).lower() in WELL_KNOWN_REPORT_PATHS
+    return "Well-known / Probed Paths" if is_well_known else "Frontend Routes"
+
+
+def _endpoint_line(endpoint):
+    """Format one endpoint dict as a Markdown bullet; optional fields are appended only when set."""
+    parts = [
+        f"- `{endpoint.get('path', '/')}` ",
+        f"- **Source:** {endpoint.get('source', 'unknown')} ",
+        f"- **Confidence:** {endpoint.get('confidence', 'low')}",
+    ]
+    if endpoint.get("status_code") is not None:
+        parts.append(f" - **Status:** {endpoint['status_code']}")
+    for key, label in (("content_type", "Content-Type"), ("note", "Note")):
+        if endpoint.get(key):
+            parts.append(f" - **{label}:** {endpoint[key]}")
+    return "".join(parts)
+
+
+def _write_endpoint_category(f, title, endpoints):
+    """Write one '####' category section to f; nothing is written for an empty list.
+
+    Static assets show a total plus at most 5 entries; other categories show at most 50.
+    """
+    if not endpoints:
+        return
+    is_static = title == "Static Assets"
+    shown = 5 if is_static else 50
+    hidden = len(endpoints) - shown
+    f.write(f"#### {title}\n\n")
+    if is_static:
+        f.write(f"- **Total Static Assets:** {len(endpoints)}\n")
+    for entry in endpoints[:shown]:
+        f.write(f"{_endpoint_line(entry)}\n")
+    if hidden > 0:
+        f.write(f"- {hidden} additional static assets omitted from Markdown.\n" if is_static else f"- Output trimmed. {hidden} additional endpoints omitted.\n")
+    f.write("\n")
+
+
 def _write_endpoint_discovery(f, endpoint_results):
     f.write("## Endpoint Discovery\n\n")
     if isinstance(endpoint_results, dict) and endpoint_results.get("error"):
@@ -92,20 +181,18 @@ def _write_endpoint_discovery(f, endpoint_results):
         if not endpoints:
             f.write("- No endpoints discovered.\n\n")
             continue
-        for endpoint in endpoints[:50]:
-            line = (
-                f"- `{endpoint.get('path', '/')}` "
-                f"- **Source:** {endpoint.get('source', 'unknown')} "
-                f"- **Confidence:** {endpoint.get('confidence', 'low')}"
-            )
-            if endpoint.get("status_code") is not None:
-                line += f" - **Status:** {endpoint['status_code']}"
-            if endpoint.get("content_type"):
-                line += f" - **Content-Type:** {endpoint['content_type']}"
-            f.write(f"{line}\n")
-        if len(endpoints) > 50:
-            f.write(f"- Output trimmed. {len(endpoints) - 50} additional endpoints omitted.\n")
-        f.write("\n")
+
+        categorized = {
+            "API-like Endpoints": [],
+            "Frontend Routes": [],
+            "Well-known / Probed Paths": [],
+            "Static Assets": [],
+        }
+        for endpoint in endpoints:
+            categorized[_endpoint_category(endpoint)].append(endpoint)
+
+        for title in ("API-like Endpoints", "Frontend Routes", "Well-known / Probed Paths", "Static Assets"):
+            _write_endpoint_category(f, title, categorized[title])
 
     f.write("---\n\n")
 
diff --git a/activerecon/modules/risk_analysis.py b/activerecon/modules/risk_analysis.py
index 1b916f7..1442579 100644
--- a/activerecon/modules/risk_analysis.py
+++ b/activerecon/modules/risk_analysis.py
@@ -1,5 +1,6 @@
 from datetime import datetime, timezone
 from email.utils import parsedate_to_datetime
+from urllib.parse import urlparse
 
 
 SECURITY_HEADERS = {
@@ -98,6 +99,33 @@ def _is_admin_debug_docs_path(path):
     return any(token in lower_path for token in ("/admin", "/debug", "/swagger", "/api-docs"))
 
 
+def _url_origin(url):
+    """Return "scheme://netloc" for an absolute URL, or "" when either part is missing."""
+    parts = urlparse(str(url or ""))
+    has_origin = bool(parts.scheme) and bool(parts.netloc)
+    return f"{parts.scheme}://{parts.netloc}" if has_origin else ""
+
+
+def _endpoint_evidence(base_url, path):
+    """Join base_url and path with exactly one slash; fall back to whichever one is present."""
+    if base_url and path:
+        return f"{str(base_url).rstrip('/')}/{str(path).lstrip('/')}"
+    # Only one side (or neither) is usable: return it as a string, "" when both are empty.
+    return str(base_url) if base_url else str(path or "")
+
+
+def _http_header_path_evidence(item, path):
+    """Return the header path as a URL on the response origin (final_url, else url), or the bare path without one."""
+    return _endpoint_evidence(_url_origin(item.get("final_url") or item.get("url")), path)
+
+
+def _header_name_from_source(source):
+    """Return the header name from a "response-header:<name>" source, else the generic "response header"."""
+    marker = "response-header:"
+    text = str(source or "")
+    return text[len(marker):] if text.startswith(marker) and len(text) > len(marker) else "response header"
+
+
 def generate_attention_findings(results, now=None):
     findings = []
     seen_endpoint_signals = set()
@@ -142,13 +170,11 @@ def add_endpoint_signal(severity, category, message, evidence):
 
             path = _first_path_like_header_value(value)
             if path:
-                findings.append(
-                    _finding(
-                        "info",
-                        "endpoint",
-                        f"Interesting path found in response header {header_name}",
-                        path,
-                    )
+                add_endpoint_signal(
+                    "info",
+                    "endpoint",
+                    f"Interesting path found in response header {header_name}",
+                    _http_header_path_evidence(item, path),
                 )
 
         if item.get("redirect_chain"):
@@ -157,23 +183,29 @@ def add_endpoint_signal(severity, category, message, evidence):
     for base_url, endpoint in _endpoint_items(results):
         path = endpoint.get("path", "")
         source = endpoint.get("source", "")
-        evidence = f"{base_url}{path}" if base_url else path
+        evidence = _endpoint_evidence(base_url, path)
 
         if path == "/robots.txt" and endpoint.get("status_code") is not None:
-            add_endpoint_signal("info", "endpoint", "robots.txt found", evidence)
+            add_endpoint_signal("info", "endpoint", "robots.txt found; follow-up recommended", evidence)
         if source == "robots.txt":
-            add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths", evidence)
+            add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths; follow-up recommended", evidence)
         if source.startswith("response-header"):
-            add_endpoint_signal("info", "endpoint", "Interesting endpoint from response header", evidence)
+            header_name = _header_name_from_source(source)
+            add_endpoint_signal(
+                "info",
+                "endpoint",
+                f"Interesting path found in response header {header_name}",
+                evidence,
+            )
         if _is_api_like_path(path):
             if source.startswith("javascript"):
-                add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like paths", evidence)
+                add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like path candidate", evidence)
             else:
-                add_endpoint_signal("info", "endpoint", "API-like endpoint discovered", evidence)
+                add_endpoint_signal("info", "endpoint", "API-like endpoint discovered; follow-up recommended", evidence)
         if _is_admin_debug_docs_path(path):
-            add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered", evidence)
+            add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered; follow-up recommended", evidence)
         if str(path).lower() == "/ftp":
-            add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered", evidence)
+            add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered; follow-up recommended", evidence)
 
     now = now or datetime.now(timezone.utc)
     comparable_now = now.replace(tzinfo=None)
diff --git a/tests/test_endpoint_discovery.py b/tests/test_endpoint_discovery.py
index 9515ddf..e265bc0 100644
--- a/tests/test_endpoint_discovery.py
+++ b/tests/test_endpoint_discovery.py
@@ -18,7 +18,10 @@ def fake_get(url, timeout):
             return Response(
                 headers={"Content-Type": "text/html"},
                 text="""
+                    <title>Juice Shop</title>
                     <a href="/login">login</a>
+                    <link rel="stylesheet" href="/style.css">
+                    <link rel="icon" href="/favicon.ico">
                     <form action="/submit"></form>
                     <script src="/app.js"></script>
                     <script>fetch("/rest/products")</script>
@@ -33,7 +36,12 @@ def fake_get(url, timeout):
         if url == "http://example.com:3000/api":
             return Response(headers={"Content-Type": "application/json"}, text="{}", url=url)
         if url == "http://example.com:3000/admin":
-            return Response(status_code=403, headers={"Content-Type": "text/html"}, text="", url=url)
+            return Response(
+                status_code=200,
+                headers={"Content-Type": "text/html"},
+                text="<title>Juice Shop</title><main></main>",
+                url=url,
+            )
         return Response(status_code=404, headers={"Content-Type": "text/plain"}, text="", url=url)
 
     monkeypatch.setattr(endpoint_discovery.requests, "get", fake_get)
@@ -61,6 +69,9 @@ def fake_get(url, timeout):
     assert results[0]["base_url"] == "http://example.com:3000"
     assert endpoints["/#/jobs"]["source"] == "response-header:X-Recruiting"
     assert endpoints["/login"]["source"] == "html:href"
+    assert endpoints["/style.css"]["source"] == "html:stylesheet"
+    assert endpoints["/favicon.ico"]["source"] == "html:icon"
+    assert endpoints["/app.js"]["source"] == "html:script-src"
     assert endpoints["/submit"]["source"] == "html:form-action"
     assert endpoints["/rest/products"]["source"] == "html-string"
     assert endpoints["/api/orders"]["source"] == "javascript"
@@ -68,7 +79,8 @@ def fake_get(url, timeout):
     assert endpoints["/robots.txt"]["content_type"] == "text/plain"
     assert endpoints["/hidden"]["source"] == "robots.txt"
     assert endpoints["/api"]["status_code"] == 200
-    assert endpoints["/admin"]["status_code"] == 403
+    assert endpoints["/admin"]["status_code"] == 200
+    assert endpoints["/admin"]["note"] == "Possible SPA fallback route"
     assert not any("cdn.example.net" in url for url, timeout in calls)
 
 
diff --git a/tests/test_report_generator.py b/tests/test_report_generator.py
index 836463e..4bad755 100644
--- a/tests/test_report_generator.py
+++ b/tests/test_report_generator.py
@@ -52,13 +52,36 @@ def test_generate_report_writes_nested_results(tmp_path):
         },
         "Endpoint Discovery": [{
             "base_url": "http://example.com",
-            "endpoints": [{
-                "path": "/api",
-                "source": "well-known",
-                "confidence": "medium",
-                "status_code": 200,
-                "content_type": "application/json",
-            }],
+            "endpoints": [
+                {
+                    "path": "/api",
+                    "source": "well-known",
+                    "confidence": "medium",
+                    "status_code": 200,
+                    "content_type": "application/json",
+                },
+                {
+                    "path": "/login",
+                    "source": "html:href",
+                    "confidence": "medium",
+                    "status_code": 200,
+                    "content_type": "text/html",
+                    "note": "Possible SPA fallback route",
+                },
+                {
+                    "path": "/robots.txt",
+                    "source": "well-known",
+                    "confidence": "medium",
+                    "status_code": 200,
+                    "content_type": "text/plain",
+                },
+                {"path": "/app.js", "source": "html:script-src", "confidence": "medium"},
+                {"path": "/style.css", "source": "html:stylesheet", "confidence": "medium"},
+                {"path": "/favicon.ico", "source": "html:icon", "confidence": "medium"},
+                {"path": "/assets/logo.png", "source": "html:src", "confidence": "medium"},
+                {"path": "/assets/chunk-1.js", "source": "html:script-src", "confidence": "medium"},
+                {"path": "/assets/extra.css", "source": "html:stylesheet", "confidence": "medium"},
+            ],
         }],
         "Attention": [{
             "severity": "low",
@@ -99,7 +122,19 @@ def test_generate_report_writes_nested_results(tmp_path):
     assert "  - `Server`: test" in content
     assert "## Endpoint Discovery" in content
     assert "### http://example.com" in content
+    assert "#### API-like Endpoints" in content
+    assert "#### Frontend Routes" in content
+    assert "#### Well-known / Probed Paths" in content
+    assert "#### Static Assets" in content
     assert "`/api` - **Source:** well-known - **Confidence:** medium - **Status:** 200 - **Content-Type:** application/json" in content
+    assert "`/login` - **Source:** html:href - **Confidence:** medium - **Status:** 200 - **Content-Type:** text/html - **Note:** Possible SPA fallback route" in content
+    assert "`/robots.txt` - **Source:** well-known - **Confidence:** medium - **Status:** 200 - **Content-Type:** text/plain" in content
+    assert "- **Total Static Assets:** 6" in content
+    assert "`/app.js` - **Source:** html:script-src" in content
+    assert "`/style.css` - **Source:** html:stylesheet" in content
+    assert "`/favicon.ico` - **Source:** html:icon" in content
+    assert "/assets/extra.css" not in content
+    assert "1 additional static assets omitted from Markdown." in content
     assert "## DNS Analysis" in content
     assert "## TLS Analysis" in content
     assert "TLSv1.3" in content
diff --git a/tests/test_risk_analysis.py b/tests/test_risk_analysis.py
index 4cd4839..230435f 100644
--- a/tests/test_risk_analysis.py
+++ b/tests/test_risk_analysis.py
@@ -124,13 +124,40 @@ def test_generate_attention_findings_reports_cors_and_header_paths_as_info():
     assert cors_finding["evidence"] == "http://example.com"
     assert endpoint_finding["severity"] == "info"
     assert endpoint_finding["message"] == "Interesting path found in response header X-Recruiting"
-    assert endpoint_finding["evidence"] == "/#/jobs"
+    assert endpoint_finding["evidence"] == "http://example.com/#/jobs"
     assert technology_finding["severity"] == "info"
     assert technology_finding["message"] == "X-Powered-By header exposed"
     assert technology_finding["evidence"] == "Express - http://example.com"
     assert all(item.get("evidence") != "no path here" for item in findings)
 
 
+def test_generate_attention_findings_deduplicates_response_header_endpoint_signals():
+    results = {  # the same header path appears in both the HTTP Analysis and Endpoint Discovery sections
+        "HTTP Analysis": [{
+            "url": "http://example.com",
+            "headers": {"X-Recruiting": "/#/jobs"},
+        }],
+        "Endpoint Discovery": [{
+            "base_url": "http://example.com",
+            "endpoints": [
+                {"path": "/#/jobs", "source": "response-header:X-Recruiting"},
+            ],
+        }],
+    }
+
+    findings = generate_attention_findings(results)
+    messages = [item["message"] for item in findings]
+    header_findings = [
+        item
+        for item in findings
+        if item["message"] == "Interesting path found in response header X-Recruiting"
+    ]
+
+    assert len(header_findings) == 1  # both sections must collapse into a single finding
+    assert header_findings[0]["evidence"] == "http://example.com/#/jobs"  # evidence is the full URL, not the bare path
+    assert "Interesting endpoint from response header" not in messages  # the old generic wording must not be emitted
+
+
 def test_generate_attention_findings_reports_endpoint_discovery_signals():
     results = {
         "Endpoint Discovery": [{
@@ -150,13 +177,13 @@ def test_generate_attention_findings_reports_endpoint_discovery_signals():
     findings = generate_attention_findings(results)
     messages = [item["message"] for item in findings]
 
-    assert "API-like endpoint discovered" in messages
-    assert "robots.txt found" in messages
-    assert "robots.txt contains Disallow paths" in messages
-    assert "Interesting endpoint from response header" in messages
-    assert "JavaScript exposes API-like paths" in messages
-    assert "Possible admin/debug/docs route discovered" in messages
-    assert "/ftp endpoint discovered" in messages
+    assert "API-like endpoint discovered; follow-up recommended" in messages
+    assert "robots.txt found; follow-up recommended" in messages
+    assert "robots.txt contains Disallow paths; follow-up recommended" in messages
+    assert "Interesting path found in response header X-Recruiting" in messages
+    assert "JavaScript exposes API-like path candidate" in messages
+    assert "Possible admin/debug/docs route discovered; follow-up recommended" in messages
+    assert "/ftp endpoint discovered; follow-up recommended" in messages
 
 
 def test_generate_attention_findings_reports_expired_tls_certificates():