From bb54b5bdd2cda3c405ad37fa0c0fe92b61542a57 Mon Sep 17 00:00:00 2001
From: CamiloCod3 <deleon88198@gmail.com>
Date: Wed, 17 Jun 2026 15:03:40 +0200
Subject: [PATCH] Add web profile endpoint discovery

---
 README.md                                 |  25 ++
 activerecon/main.py                       |  15 ++
 activerecon/modules/config/config.yaml    |  19 +-
 activerecon/modules/endpoint_discovery.py | 293 ++++++++++++++++++++++
 activerecon/modules/report_generator.py   |  41 +++
 activerecon/modules/risk_analysis.py      |  52 ++++
 tests/test_endpoint_discovery.py          |  90 +++++++
 tests/test_main.py                        |  53 ++++
 tests/test_report_generator.py            |  13 +
 tests/test_risk_analysis.py               |  28 +++
 10 files changed, 628 insertions(+), 1 deletion(-)
 create mode 100644 activerecon/modules/endpoint_discovery.py
 create mode 100644 tests/test_endpoint_discovery.py

diff --git a/README.md b/README.md
index 17b4d2c..9eaaf84 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ Instead of manually running separate commands and collecting notes from differen
 * collect HTTP status, headers, redirects, page titles, and technology hints
 * collect TLS certificate metadata for HTTPS services
 * query common DNS records
+* discover web endpoints from HTML, response headers, JavaScript, and safe well-known paths when the `web` scan profile is selected
 * generate Markdown and JSON reports
 * highlight interesting signals for follow-up review
 
@@ -75,6 +76,7 @@ ActiveRecon currently supports:
 | HTTP      | Status codes, titles, redirects, headers, technology hints |
 | TLS       | TLS version, cipher, certificate metadata                  |
 | DNS       | A, MX, and TXT lookups                                     |
+| Web       | Endpoint discovery from HTML, headers, JavaScript, and safe well-known paths |
 | Reporting | Markdown and JSON output                                   |
 | Safety    | Scope guard, dry-run mode, doctor checks                   |
 | Analysis  | Interesting signals for follow-up review                   |
@@ -153,6 +155,7 @@ Generated Markdown reports include sections such as:
 ## Scan Information
 ## Port Scan Results
 ## HTTP Analysis
+## Endpoint Discovery
 ## TLS Analysis
 ## DNS Analysis
 ## Interesting Signals
@@ -254,6 +257,25 @@ scan_profiles:
   standard: "-Pn -n -sT -sV -sC -T3"
   full: "-Pn -n -sT -p- -sV -sC -T4"
   udp: "-Pn -n -sU --top-ports 100 -sC --script-timeout 5m"
+
+web_recon:
+  enabled_profiles:
+    - web
+  endpoint_probe_limit: 50
+  fetch_javascript: true
+  same_origin_only: true
+  well_known_paths:
+    - /robots.txt
+    - /sitemap.xml
+    - /.well-known/security.txt
+    - /api
+    - /rest
+    - /ftp
+    - /admin
+    - /login
+    - /debug
+    - /swagger
+    - /api-docs
 ```
 
 ---
@@ -310,6 +332,8 @@ Attention
 
 Markdown reports use the heading `Interesting Signals`. JSON output keeps the `Attention` key for compatibility.
 
+When the `web` profile is used, reports also include `Endpoint Discovery`.
+
 ---
 
 ## Project Structure
@@ -324,6 +348,7 @@ ActiveRecon/
 |       |-- config_loader.py
 |       |-- dns_analysis.py
 |       |-- doctor.py
+|       |-- endpoint_discovery.py
 |       |-- http_enum.py
 |       |-- json_report.py
 |       |-- nmap_scan.py
diff --git a/activerecon/main.py b/activerecon/main.py
index 6d6b0e0..6ae571b 100644
--- a/activerecon/main.py
+++ b/activerecon/main.py
@@ -12,6 +12,7 @@
 from .modules.json_report import generate_json_report
 from .modules.config_loader import load_config
 from .modules.doctor import run_doctor
+from .modules.endpoint_discovery import discover_endpoints
 from .modules.risk_analysis import generate_attention_findings
 from .modules.scope_guard import is_target_in_scope
 from .modules.tls_analysis import analyze_tls
@@ -81,6 +82,12 @@ def _dns_skip_result():
     }
 
 
+def _web_recon_enabled(config, scan_profile):
+    web_recon = (config.get("web_recon") if isinstance(config, dict) else None) or {}  # empty YAML section loads as None
+    enabled = (web_recon.get("enabled_profiles") if isinstance(web_recon, dict) else None) or []
+    return scan_profile in (enabled if isinstance(enabled, (list, tuple, set)) else [enabled])  # scalar -> one-item list, no substring match
+
+
 def _safe_report_name(target):
     safe_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", target).strip("._-")
     return safe_name or "target"
@@ -219,6 +226,14 @@ def main():
         logging.error(f"Error during TLS analysis: {e}")
         results["TLS Analysis"] = {"error": f"TLS analysis failed: {e}"}
 
+    if _web_recon_enabled(config, chosen_profile):
+        try:
+            logging.info("Running endpoint discovery.")
+            results["Endpoint Discovery"] = discover_endpoints(results["HTTP Analysis"], config)
+        except Exception as e:
+            logging.error(f"Error during endpoint discovery: {e}")
+            results["Endpoint Discovery"] = {"error": f"Endpoint discovery failed: {e}"}
+
     if _is_ip_target(target):
         logging.info(DNS_IP_SKIP_REASON)
         results["DNS Analysis"] = _dns_skip_result()
diff --git a/activerecon/modules/config/config.yaml b/activerecon/modules/config/config.yaml
index d0b83f9..45db610 100644
--- a/activerecon/modules/config/config.yaml
+++ b/activerecon/modules/config/config.yaml
@@ -4,8 +4,25 @@ scan_profiles:
   full: "-Pn -n -sT -p- -sV -sC -T4"
   udp: "-Pn -n -sU --top-ports 100 -sC --script-timeout 5m"
   web: "-Pn -n -sT -p 80,443,3000,5000,8000,8080,8443,9000,9443 -sV -T3"
-
 http_timeout: 5
 nmap_timeout: 300
 # Optional override if Nmap is installed outside PATH.
 # nmap_executable: "C:\\Program Files\\Nmap\\nmap.exe"
+web_recon:
+  enabled_profiles:
+    - web
+  endpoint_probe_limit: 50
+  fetch_javascript: true
+  same_origin_only: true
+  well_known_paths:
+    - /robots.txt
+    - /sitemap.xml
+    - /.well-known/security.txt
+    - /api
+    - /rest
+    - /ftp
+    - /admin
+    - /login
+    - /debug
+    - /swagger
+    - /api-docs
diff --git a/activerecon/modules/endpoint_discovery.py b/activerecon/modules/endpoint_discovery.py
new file mode 100644
index 0000000..d519fc8
--- /dev/null
+++ b/activerecon/modules/endpoint_discovery.py
@@ -0,0 +1,293 @@
+import logging
+import re
+from html.parser import HTMLParser
+from urllib.parse import urljoin, urlparse
+
+import requests
+
+
+DEFAULT_WELL_KNOWN_PATHS = [
+    "/robots.txt",
+    "/sitemap.xml",
+    "/.well-known/security.txt",
+    "/api",
+    "/rest",
+    "/ftp",
+    "/admin",
+    "/login",
+    "/debug",
+    "/swagger",
+    "/api-docs",
+]
+DEFAULT_ENDPOINT_LIMIT = 50
+DEFAULT_HTTP_TIMEOUT = 5
+PATH_STRING_RE = re.compile(r"""["'`](/[A-Za-z0-9._~:/?#\[\]@!$&()*+,;=%-]{1,200})["'`]""")
+
+
+class EndpointHTMLParser(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.links = []
+        self.script_srcs = []
+
+    def handle_starttag(self, tag, attrs):
+        attrs_dict = dict(attrs)
+
+        if "href" in attrs_dict:
+            self.links.append((attrs_dict["href"], "html:href"))
+        if "src" in attrs_dict:
+            self.links.append((attrs_dict["src"], "html:src"))
+        if tag == "form" and attrs_dict.get("action"):
+            self.links.append((attrs_dict["action"], "html:form-action"))
+        if tag == "script" and attrs_dict.get("src"):
+            self.script_srcs.append(attrs_dict["src"])
+
+
+def _web_recon_settings(config):
+    web_recon = (config.get("web_recon") if isinstance(config, dict) else None) or {}  # empty YAML section loads as None
+    return {
+        "endpoint_probe_limit": web_recon.get("endpoint_probe_limit", DEFAULT_ENDPOINT_LIMIT),
+        "fetch_javascript": web_recon.get("fetch_javascript", True),
+        "same_origin_only": web_recon.get("same_origin_only", True),
+        "well_known_paths": web_recon.get("well_known_paths", DEFAULT_WELL_KNOWN_PATHS),
+    }
+
+
+def _timeout(config):
+    if isinstance(config, dict):
+        return config.get("http_timeout", DEFAULT_HTTP_TIMEOUT)
+    return DEFAULT_HTTP_TIMEOUT
+
+
+def _limit(value):
+    try:
+        return max(0, int(value))
+    except (TypeError, ValueError, OverflowError):  # int(float("inf")) raises OverflowError, e.g. YAML ".inf"
+        return DEFAULT_ENDPOINT_LIMIT
+
+
+def _is_successful_http_result(item):
+    if not isinstance(item, dict) or item.get("error"):
+        return False
+    try:
+        status = int(item.get("status", 0))
+    except (TypeError, ValueError):
+        return False
+    return 200 <= status < 400
+
+
+def _origin(url):
+    parsed = urlparse(url)
+    if not parsed.scheme or not parsed.netloc:
+        return ""
+    return f"{parsed.scheme}://{parsed.netloc}"
+
+
+def _same_origin(url, base_url):
+    return _origin(url) == _origin(base_url)
+
+
+def _path_from_url(url):
+    parsed = urlparse(url)
+    path = parsed.path or "/"
+    if parsed.query:
+        path = f"{path}?{parsed.query}"
+    if parsed.fragment:
+        path = f"{path}#{parsed.fragment}"
+    return path
+
+
+def _normalize_candidate(value, base_url, same_origin_only=True):
+    if not value:
+        return None
+
+    raw_value = str(value).strip()
+    if raw_value.startswith(("mailto:", "tel:", "javascript:", "data:")):
+        return None
+
+    absolute = urljoin(base_url, raw_value)
+    parsed = urlparse(absolute)
+    if parsed.scheme not in {"http", "https"}:
+        return None
+    if same_origin_only and not _same_origin(absolute, base_url):
+        return None
+
+    path = _path_from_url(absolute)
+    if not path.startswith("/") or path.startswith("//"):
+        return None
+    return path
+
+
+def _confidence(source):
+    if source.startswith(("response-header", "well-known", "robots.txt", "html:")):
+        return "medium"
+    return "low"
+
+
+def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None):
+    if not path or not path.startswith("/") or path.startswith("//") or len(path) > 250:
+        return
+    if path not in endpoints and len(endpoints) >= limit:
+        return
+
+    if path not in endpoints:
+        endpoints[path] = {
+            "path": path,
+            "source": source,
+            "confidence": _confidence(source),
+        }
+
+    if status_code is not None:
+        endpoints[path]["status_code"] = status_code
+    if content_type:
+        endpoints[path]["content_type"] = content_type
+
+
+def _extract_paths_from_text(text):
+    paths = []
+    for match in PATH_STRING_RE.finditer(text or ""):
+        path = match.group(1).strip()
+        if path.startswith("/") and not path.startswith("//"):
+            paths.append(path)
+    return paths
+
+
+def _path_like_header_values(value):
+    values = value if isinstance(value, (list, tuple, set)) else [value]
+    paths = []
+    for raw_value in values:
+        for candidate in str(raw_value).split(","):
+            path = candidate.strip().strip("\"'")
+            if path.startswith("/") and not path.startswith("//") and len(path) > 1:
+                paths.append(path)
+    return paths
+
+
+def _safe_get(url, timeout):
+    try:
+        return requests.get(url, timeout=timeout)
+    except requests.RequestException as e:
+        logging.debug(f"Endpoint discovery request failed for {url}: {e}")
+        return None
+
+
+def _content_type(response):
+    return response.headers.get("Content-Type") or response.headers.get("content-type") or ""
+
+
+def _is_found_probe(response):
+    if response is None:
+        return False
+    return response.status_code < 400 or response.status_code in {401, 403}
+
+
+def _robots_disallow_paths(text):
+    """Return the absolute Disallow paths listed in robots.txt content, ignoring comments."""
+    paths = []
+    for line in (text or "").splitlines():
+        # Drop any trailing "# comment", then split the "field: value" rule on the first colon.
+        field, _, value = line.split("#", 1)[0].partition(":")
+        path = value.strip()
+        if field.strip().lower() == "disallow" and path.startswith("/") and not path.startswith("//"):
+            paths.append(path)
+    return paths
+
+
+def _parse_html(text):
+    parser = EndpointHTMLParser()
+    try:
+        parser.feed(text or "")
+    except Exception as e:
+        logging.debug(f"Endpoint discovery HTML parsing failed: {e}")
+    return parser
+
+
+def discover_endpoints(http_results, config=None):
+    """
+    Discover a small set of interesting endpoints from HTTP results.
+    This intentionally avoids aggressive directory brute forcing.
+    """
+    if not isinstance(http_results, list):
+        return []
+
+    settings = _web_recon_settings(config or {})
+    endpoint_limit = _limit(settings["endpoint_probe_limit"])
+    timeout = _timeout(config)
+    same_origin_only = bool(settings["same_origin_only"])
+    fetch_javascript = bool(settings["fetch_javascript"])
+    well_known_paths = settings["well_known_paths"] or DEFAULT_WELL_KNOWN_PATHS
+    groups = []
+
+    for item in http_results:
+        if not _is_successful_http_result(item):
+            continue
+
+        base_url = item.get("final_url") or item.get("url")
+        if not base_url:
+            continue
+        base_origin = _origin(base_url)
+        if not base_origin:
+            continue
+
+        endpoints = {}
+        requests_made = 0
+
+        def get_if_allowed(url):
+            nonlocal requests_made
+            if requests_made >= endpoint_limit:
+                return None
+            requests_made += 1
+            return _safe_get(url, timeout)
+
+        for header_name, header_value in (item.get("headers") or {}).items():
+            for path in _path_like_header_values(header_value):
+                _add_endpoint(endpoints, path, f"response-header:{header_name}", endpoint_limit)
+
+        page_response = get_if_allowed(base_url)
+        if page_response is not None and page_response.status_code < 400 and "html" in _content_type(page_response).lower():
+            html_text = getattr(page_response, "text", "")[:200000]
+            parser = _parse_html(html_text)
+            for raw_link, source in parser.links:
+                path = _normalize_candidate(raw_link, base_url, same_origin_only)
+                if path:
+                    _add_endpoint(endpoints, path, source, endpoint_limit)
+            for path in _extract_paths_from_text(html_text):
+                _add_endpoint(endpoints, path, "html-string", endpoint_limit)
+
+            if fetch_javascript:
+                for script_src in parser.script_srcs:
+                    script_url = urljoin(base_url, script_src)
+                    if same_origin_only and not _same_origin(script_url, base_url):
+                        continue
+                    script_response = get_if_allowed(script_url)
+                    if script_response is None or script_response.status_code >= 400:
+                        continue
+                    script_text = getattr(script_response, "text", "")[:200000]
+                    for path in _extract_paths_from_text(script_text):
+                        _add_endpoint(endpoints, path, "javascript", endpoint_limit)
+
+        for path in well_known_paths[:endpoint_limit]:
+            if not str(path).startswith("/"):
+                continue
+            response = get_if_allowed(urljoin(base_origin, path))
+            if not _is_found_probe(response):
+                continue
+            _add_endpoint(
+                endpoints,
+                path,
+                "well-known",
+                endpoint_limit,
+                status_code=response.status_code,
+                content_type=_content_type(response),
+            )
+            if path == "/robots.txt" and response.status_code < 400:
+                for disallow_path in _robots_disallow_paths(getattr(response, "text", "")):
+                    _add_endpoint(endpoints, disallow_path, "robots.txt", endpoint_limit)
+
+        if endpoints:
+            groups.append({
+                "base_url": base_origin,
+                "endpoints": list(endpoints.values()),
+            })
+
+    return groups
diff --git a/activerecon/modules/report_generator.py b/activerecon/modules/report_generator.py
index 8af5dd6..c225713 100644
--- a/activerecon/modules/report_generator.py
+++ b/activerecon/modules/report_generator.py
@@ -73,6 +73,43 @@ def _write_http_result(f, item):
             f.write(f"  - `{key}`: {value}\n")
 
 
+def _write_endpoint_discovery(f, endpoint_results):
+    f.write("## Endpoint Discovery\n\n")
+    if isinstance(endpoint_results, dict) and endpoint_results.get("error"):
+        f.write(f"**Error:** {endpoint_results['error']}\n")
+        f.write("---\n\n")
+        return
+
+    groups = endpoint_results if isinstance(endpoint_results, list) else []
+    if not groups:
+        f.write("No endpoints discovered.\n")
+        f.write("---\n\n")
+        return
+
+    for group in groups:
+        f.write(f"### {group.get('base_url', 'Unknown base URL')}\n\n")
+        endpoints = group.get("endpoints", [])
+        if not endpoints:
+            f.write("- No endpoints discovered.\n\n")
+            continue
+        for endpoint in endpoints[:50]:
+            line = (
+                f"- `{endpoint.get('path', '/')}` "
+                f"- **Source:** {endpoint.get('source', 'unknown')} "
+                f"- **Confidence:** {endpoint.get('confidence', 'low')}"
+            )
+            if endpoint.get("status_code") is not None:
+                line += f" - **Status:** {endpoint['status_code']}"
+            if endpoint.get("content_type"):
+                line += f" - **Content-Type:** {endpoint['content_type']}"
+            f.write(f"{line}\n")
+        if len(endpoints) > 50:
+            f.write(f"- Output trimmed. {len(endpoints) - 50} additional endpoints omitted.\n")
+        f.write("\n")
+
+    f.write("---\n\n")
+
+
 def build_report_summary(results):
     nmap_results = results.get("Nmap Scan", results)
     ports = _as_list(nmap_results.get("ports", []))
@@ -107,6 +144,7 @@ def generate_report(target, results, output_file):
     logging.info(f"Generating report to: {output_file}")
     nmap_results = results.get("Nmap Scan", results)
     http_results = results.get("HTTP Analysis", [])
+    endpoint_results = results.get("Endpoint Discovery")
     tls_results = results.get("TLS Analysis", [])
     dns_results = results.get("DNS Analysis", {})
     attention_results = results.get("Attention", [])
@@ -176,6 +214,9 @@ def generate_report(target, results, output_file):
             f.write("No HTTP services analyzed.\n")
         f.write("---\n\n")
 
+        if "Endpoint Discovery" in results:
+            _write_endpoint_discovery(f, endpoint_results)
+
         f.write("## TLS Analysis\n\n")
         if isinstance(tls_results, dict) and tls_results.get("error"):
             f.write(f"**Error:** {tls_results['error']}\n")
diff --git a/activerecon/modules/risk_analysis.py b/activerecon/modules/risk_analysis.py
index 3448a33..1b916f7 100644
--- a/activerecon/modules/risk_analysis.py
+++ b/activerecon/modules/risk_analysis.py
@@ -49,6 +49,19 @@ def _dns_results(results):
     return dns_results if isinstance(dns_results, dict) else {}
 
 
+def _endpoint_groups(results):
+    endpoint_results = results.get("Endpoint Discovery", [])
+    return endpoint_results if isinstance(endpoint_results, list) else []
+
+
+def _endpoint_items(results):
+    for group in _endpoint_groups(results):
+        base_url = group.get("base_url", "")
+        for endpoint in group.get("endpoints", []):
+            if isinstance(endpoint, dict):
+                yield base_url, endpoint
+
+
 def _is_https_result(item):
     url = str(item.get("final_url") or item.get("url") or "").lower()
     return url.startswith("https://")
@@ -75,8 +88,26 @@ def _header_value_text(value):
     return str(value).strip()
 
 
+def _is_api_like_path(path):
+    lower_path = str(path).lower()
+    return lower_path == "/api" or lower_path == "/rest" or lower_path.startswith("/api/") or lower_path.startswith("/rest/")
+
+
+def _is_admin_debug_docs_path(path):
+    lower_path = str(path).lower()
+    return any(token in lower_path for token in ("/admin", "/debug", "/swagger", "/api-docs"))
+
+
 def generate_attention_findings(results, now=None):
     findings = []
+    seen_endpoint_signals = set()
+
+    def add_endpoint_signal(severity, category, message, evidence):
+        key = (category, message, evidence)
+        if key in seen_endpoint_signals:
+            return
+        seen_endpoint_signals.add(key)
+        findings.append(_finding(severity, category, message, evidence))
 
     for port in _open_ports(results):
         service = str(port.get("service", "")).lower()
@@ -123,6 +154,27 @@ def generate_attention_findings(results, now=None):
         if item.get("redirect_chain"):
             findings.append(_finding("info", "http", "HTTP redirects observed", " -> ".join(item["redirect_chain"])))
 
+    for base_url, endpoint in _endpoint_items(results):
+        path = endpoint.get("path", "")
+        source = endpoint.get("source", "")
+        evidence = f"{base_url}{path}" if base_url else path
+
+        if path == "/robots.txt" and endpoint.get("status_code") is not None:
+            add_endpoint_signal("info", "endpoint", "robots.txt found", evidence)
+        if source == "robots.txt":
+            add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths", evidence)
+        if source.startswith("response-header"):
+            add_endpoint_signal("info", "endpoint", "Interesting endpoint from response header", evidence)
+        if _is_api_like_path(path):
+            if source.startswith("javascript"):
+                add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like paths", evidence)
+            else:
+                add_endpoint_signal("info", "endpoint", "API-like endpoint discovered", evidence)
+        if _is_admin_debug_docs_path(path):
+            add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered", evidence)
+        if str(path).lower() == "/ftp":
+            add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered", evidence)
+
     now = now or datetime.now(timezone.utc)
     comparable_now = now.replace(tzinfo=None)
     for item in _tls_results(results):
diff --git a/tests/test_endpoint_discovery.py b/tests/test_endpoint_discovery.py
new file mode 100644
index 0000000..9515ddf
--- /dev/null
+++ b/tests/test_endpoint_discovery.py
@@ -0,0 +1,90 @@
+from activerecon.modules import endpoint_discovery
+
+
+class Response:
+    def __init__(self, status_code=200, headers=None, text="", url="http://example.com:3000"):
+        self.status_code = status_code
+        self.headers = headers or {}
+        self.text = text
+        self.url = url
+
+
+def test_discover_endpoints_extracts_html_js_headers_and_safe_probes(monkeypatch):
+    calls = []
+
+    def fake_get(url, timeout):
+        calls.append((url, timeout))
+        if url == "http://example.com:3000":
+            return Response(
+                headers={"Content-Type": "text/html"},
+                text="""
+                    <a href="/login">login</a>
+                    <form action="/submit"></form>
+                    <script src="/app.js"></script>
+                    <script>fetch("/rest/products")</script>
+                    <script src="https://cdn.example.net/external.js"></script>
+                """,
+                url=url,
+            )
+        if url == "http://example.com:3000/app.js":
+            return Response(headers={"Content-Type": "application/javascript"}, text='const api = "/api/orders";', url=url)
+        if url == "http://example.com:3000/robots.txt":
+            return Response(headers={"Content-Type": "text/plain"}, text="Disallow: /hidden\n", url=url)
+        if url == "http://example.com:3000/api":
+            return Response(headers={"Content-Type": "application/json"}, text="{}", url=url)
+        if url == "http://example.com:3000/admin":
+            return Response(status_code=403, headers={"Content-Type": "text/html"}, text="", url=url)
+        return Response(status_code=404, headers={"Content-Type": "text/plain"}, text="", url=url)
+
+    monkeypatch.setattr(endpoint_discovery.requests, "get", fake_get)
+
+    results = endpoint_discovery.discover_endpoints(
+        [{
+            "url": "http://example.com:3000",
+            "final_url": "http://example.com:3000",
+            "status": 200,
+            "headers": {"X-Recruiting": "/#/jobs"},
+        }],
+        {
+            "http_timeout": 2,
+            "web_recon": {
+                "endpoint_probe_limit": 20,
+                "fetch_javascript": True,
+                "same_origin_only": True,
+                "well_known_paths": ["/robots.txt", "/api", "/admin"],
+            },
+        },
+    )
+
+    endpoints = {item["path"]: item for item in results[0]["endpoints"]}
+
+    assert results[0]["base_url"] == "http://example.com:3000"
+    assert endpoints["/#/jobs"]["source"] == "response-header:X-Recruiting"
+    assert endpoints["/login"]["source"] == "html:href"
+    assert endpoints["/submit"]["source"] == "html:form-action"
+    assert endpoints["/rest/products"]["source"] == "html-string"
+    assert endpoints["/api/orders"]["source"] == "javascript"
+    assert endpoints["/robots.txt"]["status_code"] == 200
+    assert endpoints["/robots.txt"]["content_type"] == "text/plain"
+    assert endpoints["/hidden"]["source"] == "robots.txt"
+    assert endpoints["/api"]["status_code"] == 200
+    assert endpoints["/admin"]["status_code"] == 403
+    assert not any("cdn.example.net" in url for url, timeout in calls)
+
+
+def test_discover_endpoints_skips_unsuccessful_http_results(monkeypatch):
+    monkeypatch.setattr(
+        endpoint_discovery.requests,
+        "get",
+        lambda url, timeout: (_ for _ in ()).throw(AssertionError("No requests expected")),
+    )
+
+    results = endpoint_discovery.discover_endpoints(
+        [
+            {"url": "http://example.com", "status": 500, "headers": {}},
+            {"url": "http://example.com", "status": 200, "error": "timeout"},
+        ],
+        {"web_recon": {"endpoint_probe_limit": 5}},
+    )
+
+    assert results == []
diff --git a/tests/test_main.py b/tests/test_main.py
index 9e2179b..9a55d90 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -286,6 +286,59 @@ def fake_report(target, results, output_file):
     }
 
 
+def test_main_web_profile_runs_endpoint_discovery(monkeypatch, tmp_path):
+    output = tmp_path / "report.md"
+    captured = {}
+
+    def fake_nmap(target, scan_command, config):
+        assert scan_command == "-web"
+        return {
+            "target": target,
+            "ports": [{"portid": "3000", "protocol": "tcp", "state": "open", "service": "ppp"}],
+            "status": {"state": "up"},
+            "scan_info": {},
+            "host": target,
+        }
+
+    def fake_http(target, config, http_ports):
+        return [{"url": "http://example.com:3000", "status": 200, "headers": {}}]
+
+    def fake_endpoints(http_results, config):
+        captured["endpoint_http_results"] = http_results
+        return [{"base_url": "http://example.com:3000", "endpoints": [{"path": "/api", "source": "well-known"}]}]
+
+    def fake_report(target, results, output_file):
+        captured["results"] = results
+
+    monkeypatch.setattr(
+        main_module,
+        "CONFIG",
+        {
+            "scan_profiles": {"web": "-web"},
+            "http_timeout": 5,
+            "web_recon": {"enabled_profiles": ["web"]},
+        },
+    )
+    monkeypatch.setattr(main_module, "run_nmap_scan", fake_nmap)
+    monkeypatch.setattr(main_module, "analyze_http", fake_http)
+    monkeypatch.setattr(main_module, "analyze_tls", lambda http_results, timeout: [])
+    monkeypatch.setattr(main_module, "discover_endpoints", fake_endpoints)
+    monkeypatch.setattr(main_module, "analyze_dns", lambda target: {"A": [], "MX": [], "TXT": []})
+    monkeypatch.setattr(main_module, "generate_attention_findings", lambda results: [])
+    monkeypatch.setattr(main_module, "generate_report", fake_report)
+    monkeypatch.setattr(main_module, "generate_json_report", lambda target, results, output_file: None)
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        ["activerecon", "--target", "example.com", "--scan-profile", "web", "--output", str(output)],
+    )
+
+    main_module.main()
+
+    assert captured["endpoint_http_results"] == [{"url": "http://example.com:3000", "status": 200, "headers": {}}]
+    assert captured["results"]["Endpoint Discovery"][0]["endpoints"][0]["path"] == "/api"
+
+
 def test_main_rejects_target_outside_scope(monkeypatch, tmp_path):
     scope = tmp_path / "scope.txt"
     scope.write_text("allowed.example.com\n", encoding="utf-8")
diff --git a/tests/test_report_generator.py b/tests/test_report_generator.py
index 02590c9..836463e 100644
--- a/tests/test_report_generator.py
+++ b/tests/test_report_generator.py
@@ -50,6 +50,16 @@ def test_generate_report_writes_nested_results(tmp_path):
             "TXT": [],
             "errors": {"MX": "missing"},
         },
+        "Endpoint Discovery": [{
+            "base_url": "http://example.com",
+            "endpoints": [{
+                "path": "/api",
+                "source": "well-known",
+                "confidence": "medium",
+                "status_code": 200,
+                "content_type": "application/json",
+            }],
+        }],
         "Attention": [{
             "severity": "low",
             "category": "http",
@@ -87,6 +97,9 @@ def test_generate_report_writes_nested_results(tmp_path):
     assert "  - `server:test`" in content
     assert "- **Response Headers:**" in content
     assert "  - `Server`: test" in content
+    assert "## Endpoint Discovery" in content
+    assert "### http://example.com" in content
+    assert "`/api` - **Source:** well-known - **Confidence:** medium - **Status:** 200 - **Content-Type:** application/json" in content
     assert "## DNS Analysis" in content
     assert "## TLS Analysis" in content
     assert "TLSv1.3" in content
diff --git a/tests/test_risk_analysis.py b/tests/test_risk_analysis.py
index 7d30e63..4cd4839 100644
--- a/tests/test_risk_analysis.py
+++ b/tests/test_risk_analysis.py
@@ -131,6 +131,34 @@ def test_generate_attention_findings_reports_cors_and_header_paths_as_info():
     assert all(item.get("evidence") != "no path here" for item in findings)
 
 
+def test_generate_attention_findings_reports_endpoint_discovery_signals():
+    results = {
+        "Endpoint Discovery": [{
+            "base_url": "http://example.com",
+            "endpoints": [
+                {"path": "/api", "source": "well-known", "status_code": 200},
+                {"path": "/robots.txt", "source": "well-known", "status_code": 200},
+                {"path": "/hidden", "source": "robots.txt"},
+                {"path": "/#/jobs", "source": "response-header:X-Recruiting"},
+                {"path": "/api/orders", "source": "javascript"},
+                {"path": "/admin", "source": "well-known", "status_code": 403},
+                {"path": "/ftp", "source": "well-known", "status_code": 200},
+            ],
+        }],
+    }
+
+    findings = generate_attention_findings(results)
+    messages = [item["message"] for item in findings]
+
+    assert "API-like endpoint discovered" in messages
+    assert "robots.txt found" in messages
+    assert "robots.txt contains Disallow paths" in messages
+    assert "Interesting endpoint from response header" in messages
+    assert "JavaScript exposes API-like paths" in messages
+    assert "Possible admin/debug/docs route discovered" in messages
+    assert "/ftp endpoint discovered" in messages
+
+
 def test_generate_attention_findings_reports_expired_tls_certificates():
     results = {
         "TLS Analysis": [{