Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 34 additions & 5 deletions activerecon/modules/endpoint_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
# NOTE(review): the consumers of these two defaults are not visible in this
# hunk; presumably they are the fallback endpoint cap and HTTP timeout used
# when the configuration does not supply values -- confirm against the
# settings helper.
DEFAULT_ENDPOINT_LIMIT = 50
DEFAULT_HTTP_TIMEOUT = 5
# Matches a string quoted with ", ' or ` whose content starts with "/" and is
# followed by 1-200 characters from the URL-safe set; group 1 is the path.
PATH_STRING_RE = re.compile(r"""["'`](/[A-Za-z0-9._~:/?#\[\]@!$&()*+,;=%-]{1,200})["'`]""")
# Captures the text between <title ...> and </title>; case-insensitive, and
# DOTALL lets the captured title span multiple lines.
TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", flags=re.IGNORECASE | re.DOTALL)


class EndpointHTMLParser(HTMLParser):
Expand All @@ -34,13 +35,21 @@ def handle_starttag(self, tag, attrs):
attrs_dict = dict(attrs)

if "href" in attrs_dict:
self.links.append((attrs_dict["href"], "html:href"))
rel = str(attrs_dict.get("rel", "")).lower()
if tag == "link" and "stylesheet" in rel:
self.links.append((attrs_dict["href"], "html:stylesheet"))
elif tag == "link" and "icon" in rel:
self.links.append((attrs_dict["href"], "html:icon"))
else:
self.links.append((attrs_dict["href"], "html:href"))
if tag == "script" and attrs_dict.get("src"):
self.links.append((attrs_dict["src"], "html:script-src"))
self.script_srcs.append(attrs_dict["src"])
if "src" in attrs_dict:
self.links.append((attrs_dict["src"], "html:src"))
if tag != "script":
self.links.append((attrs_dict["src"], "html:src"))
if tag == "form" and attrs_dict.get("action"):
self.links.append((attrs_dict["action"], "html:form-action"))
if tag == "script" and attrs_dict.get("src"):
self.script_srcs.append(attrs_dict["src"])


def _web_recon_settings(config):
Expand Down Expand Up @@ -124,7 +133,7 @@ def _confidence(source):
return "low"


def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None):
def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type=None, note=None):
if not path or not path.startswith("/") or path.startswith("//") or len(path) > 250:
return
if path not in endpoints and len(endpoints) >= limit:
Expand All @@ -141,6 +150,8 @@ def _add_endpoint(endpoints, path, source, limit, status_code=None, content_type
endpoints[path]["status_code"] = status_code
if content_type:
endpoints[path]["content_type"] = content_type
if note:
endpoints[path]["note"] = note


def _extract_paths_from_text(text):
Expand Down Expand Up @@ -175,6 +186,13 @@ def _content_type(response):
return response.headers.get("Content-Type") or response.headers.get("content-type") or ""


def _title_from_html(text):
    """Return the ``<title>`` text found in ``text`` with whitespace collapsed.

    ``text`` is searched with ``TITLE_RE``; a falsy value is treated as an
    empty string. Every run of whitespace in the captured title is replaced
    by a single space and the result is stripped. Returns ``None`` when the
    pattern does not match.
    """
    found = TITLE_RE.search(text if text else "")
    if found is None:
        return None
    raw_title = found.group(1)
    return re.sub(r"\s+", " ", raw_title).strip()


def _is_found_probe(response):
if response is None:
return False
Expand Down Expand Up @@ -244,8 +262,10 @@ def get_if_allowed(url):
_add_endpoint(endpoints, path, f"response-header:{header_name}", endpoint_limit)

page_response = get_if_allowed(base_url)
root_title = None
if page_response is not None and page_response.status_code < 400 and "html" in _content_type(page_response).lower():
html_text = getattr(page_response, "text", "")[:200000]
root_title = _title_from_html(html_text)
parser = _parse_html(html_text)
for raw_link, source in parser.links:
path = _normalize_candidate(raw_link, base_url, same_origin_only)
Expand All @@ -272,13 +292,22 @@ def get_if_allowed(url):
response = get_if_allowed(urljoin(base_origin, path))
if not _is_found_probe(response):
continue
note = None
if (
response.status_code == 200
and root_title
and "html" in _content_type(response).lower()
and _title_from_html(getattr(response, "text", "")[:200000]) == root_title
):
note = "Possible SPA fallback route"
_add_endpoint(
endpoints,
path,
"well-known",
endpoint_limit,
status_code=response.status_code,
content_type=_content_type(response),
note=note,
)
if path == "/robots.txt" and response.status_code < 400:
for disallow_path in _robots_disallow_paths(getattr(response, "text", "")):
Expand Down
115 changes: 101 additions & 14 deletions activerecon/modules/report_generator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import ipaddress
import logging
from pathlib import PurePosixPath
from pathlib import Path


Expand Down Expand Up @@ -73,6 +74,94 @@ def _write_http_result(f, item):
f.write(f" - `{key}`: {value}\n")


# Lowercase file suffixes (leading dot included) that mark an endpoint path
# as a static asset; compared against the suffix of the lowercased path.
STATIC_ASSET_EXTENSIONS = {
".css",
".eot",
".gif",
".ico",
".jpeg",
".jpg",
".js",
".map",
".png",
".svg",
".ttf",
".webp",
".woff",
".woff2",
}
# Exact (lowercased) paths that are reported under the
# "Well-known / Probed Paths" heading instead of as frontend routes.
WELL_KNOWN_REPORT_PATHS = {
"/robots.txt",
"/sitemap.xml",
"/.well-known/security.txt",
"/swagger",
"/api-docs",
"/ftp",
}


def _path_without_query(path):
return str(path or "/").split("?", 1)[0].split("#", 1)[0]


def _is_api_like_endpoint(path):
lower_path = str(path or "").lower()
return lower_path == "/api" or lower_path == "/rest" or lower_path.startswith("/api/") or lower_path.startswith("/rest/")


def _is_static_asset(path):
    """Return True when ``path`` looks like a static asset.

    The query string and fragment are dropped and the path is lowercased.
    It counts as static when its suffix is listed in
    ``STATIC_ASSET_EXTENSIONS`` or when its final component contains the
    substring ``"chunk"``.
    """
    posix_path = PurePosixPath(_path_without_query(path).lower())
    if posix_path.suffix in STATIC_ASSET_EXTENSIONS:
        return True
    return "chunk" in posix_path.name


def _endpoint_category(endpoint):
    """Return the report heading under which ``endpoint`` is listed.

    Checks run in priority order: static asset, API-like path, exact match
    against ``WELL_KNOWN_REPORT_PATHS`` (case-insensitive), and finally the
    ``"Frontend Routes"`` fallback.
    """
    path = endpoint.get("path", "")
    if _is_static_asset(path):
        category = "Static Assets"
    elif _is_api_like_endpoint(path):
        category = "API-like Endpoints"
    elif str(path).lower() in WELL_KNOWN_REPORT_PATHS:
        category = "Well-known / Probed Paths"
    else:
        category = "Frontend Routes"
    return category


def _endpoint_line(endpoint):
line = (
f"- `{endpoint.get('path', '/')}` "
f"- **Source:** {endpoint.get('source', 'unknown')} "
f"- **Confidence:** {endpoint.get('confidence', 'low')}"
)
if endpoint.get("status_code") is not None:
line += f" - **Status:** {endpoint['status_code']}"
if endpoint.get("content_type"):
line += f" - **Content-Type:** {endpoint['content_type']}"
if endpoint.get("note"):
line += f" - **Note:** {endpoint['note']}"
return line


def _write_endpoint_category(f, title, endpoints):
    """Write one ``####`` sub-section listing ``endpoints`` to ``f``.

    Nothing is written when ``endpoints`` is empty. The "Static Assets"
    category gets a total-count line and shows at most 5 entries; every other
    category shows at most 50. When entries are cut off, a trailing bullet
    states how many were omitted. The section ends with a blank line.
    """
    if not endpoints:
        return

    is_static = title == "Static Assets"
    shown_limit = 5 if is_static else 50
    hidden_count = len(endpoints) - shown_limit

    f.write(f"#### {title}\n\n")
    if is_static:
        f.write(f"- **Total Static Assets:** {len(endpoints)}\n")
    for entry in endpoints[:shown_limit]:
        f.write(f"{_endpoint_line(entry)}\n")
    if hidden_count > 0:
        if is_static:
            f.write(f"- {hidden_count} additional static assets omitted from Markdown.\n")
        else:
            f.write(f"- Output trimmed. {hidden_count} additional endpoints omitted.\n")
    f.write("\n")


def _write_endpoint_discovery(f, endpoint_results):
f.write("## Endpoint Discovery\n\n")
if isinstance(endpoint_results, dict) and endpoint_results.get("error"):
Expand All @@ -92,20 +181,18 @@ def _write_endpoint_discovery(f, endpoint_results):
if not endpoints:
f.write("- No endpoints discovered.\n\n")
continue
for endpoint in endpoints[:50]:
line = (
f"- `{endpoint.get('path', '/')}` "
f"- **Source:** {endpoint.get('source', 'unknown')} "
f"- **Confidence:** {endpoint.get('confidence', 'low')}"
)
if endpoint.get("status_code") is not None:
line += f" - **Status:** {endpoint['status_code']}"
if endpoint.get("content_type"):
line += f" - **Content-Type:** {endpoint['content_type']}"
f.write(f"{line}\n")
if len(endpoints) > 50:
f.write(f"- Output trimmed. {len(endpoints) - 50} additional endpoints omitted.\n")
f.write("\n")

categorized = {
"API-like Endpoints": [],
"Frontend Routes": [],
"Well-known / Probed Paths": [],
"Static Assets": [],
}
for endpoint in endpoints:
categorized[_endpoint_category(endpoint)].append(endpoint)

for title in ("API-like Endpoints", "Frontend Routes", "Well-known / Probed Paths", "Static Assets"):
_write_endpoint_category(f, title, categorized[title])

f.write("---\n\n")

Expand Down
62 changes: 47 additions & 15 deletions activerecon/modules/risk_analysis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from urllib.parse import urlparse


SECURITY_HEADERS = {
Expand Down Expand Up @@ -98,6 +99,33 @@ def _is_admin_debug_docs_path(path):
return any(token in lower_path for token in ("/admin", "/debug", "/swagger", "/api-docs"))


def _url_origin(url):
parsed = urlparse(str(url or ""))
if not parsed.scheme or not parsed.netloc:
return ""
return f"{parsed.scheme}://{parsed.netloc}"


def _endpoint_evidence(base_url, path):
if not base_url:
return str(path or "")
if not path:
return str(base_url)
return f"{str(base_url).rstrip('/')}/{str(path).lstrip('/')}"


def _http_header_path_evidence(item, path):
    """Return evidence text for a path found in an HTTP result's headers.

    The origin is taken from ``item["final_url"]``, falling back to
    ``item["url"]``. When an origin is available the path is joined onto it;
    otherwise the bare path is returned as a string (``""`` if falsy).
    """
    source_url = item.get("final_url") or item.get("url")
    origin = _url_origin(source_url)
    if not origin:
        return str(path or "")
    return _endpoint_evidence(origin, path)


def _header_name_from_source(source):
prefix, separator, header_name = str(source or "").partition(":")
if prefix == "response-header" and separator and header_name:
return header_name
return "response header"


def generate_attention_findings(results, now=None):
findings = []
seen_endpoint_signals = set()
Expand Down Expand Up @@ -142,13 +170,11 @@ def add_endpoint_signal(severity, category, message, evidence):

path = _first_path_like_header_value(value)
if path:
findings.append(
_finding(
"info",
"endpoint",
f"Interesting path found in response header {header_name}",
path,
)
add_endpoint_signal(
"info",
"endpoint",
f"Interesting path found in response header {header_name}",
_http_header_path_evidence(item, path),
)

if item.get("redirect_chain"):
Expand All @@ -157,23 +183,29 @@ def add_endpoint_signal(severity, category, message, evidence):
for base_url, endpoint in _endpoint_items(results):
path = endpoint.get("path", "")
source = endpoint.get("source", "")
evidence = f"{base_url}{path}" if base_url else path
evidence = _endpoint_evidence(base_url, path)

if path == "/robots.txt" and endpoint.get("status_code") is not None:
add_endpoint_signal("info", "endpoint", "robots.txt found", evidence)
add_endpoint_signal("info", "endpoint", "robots.txt found; follow-up recommended", evidence)
if source == "robots.txt":
add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths", evidence)
add_endpoint_signal("info", "endpoint", "robots.txt contains Disallow paths; follow-up recommended", evidence)
if source.startswith("response-header"):
add_endpoint_signal("info", "endpoint", "Interesting endpoint from response header", evidence)
header_name = _header_name_from_source(source)
add_endpoint_signal(
"info",
"endpoint",
f"Interesting path found in response header {header_name}",
evidence,
)
if _is_api_like_path(path):
if source.startswith("javascript"):
add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like paths", evidence)
add_endpoint_signal("info", "endpoint", "JavaScript exposes API-like path candidate", evidence)
else:
add_endpoint_signal("info", "endpoint", "API-like endpoint discovered", evidence)
add_endpoint_signal("info", "endpoint", "API-like endpoint discovered; follow-up recommended", evidence)
if _is_admin_debug_docs_path(path):
add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered", evidence)
add_endpoint_signal("info", "endpoint", "Possible admin/debug/docs route discovered; follow-up recommended", evidence)
if str(path).lower() == "/ftp":
add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered", evidence)
add_endpoint_signal("info", "endpoint", "/ftp endpoint discovered; follow-up recommended", evidence)

now = now or datetime.now(timezone.utc)
comparable_now = now.replace(tzinfo=None)
Expand Down
16 changes: 14 additions & 2 deletions tests/test_endpoint_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ def fake_get(url, timeout):
return Response(
headers={"Content-Type": "text/html"},
text="""
<title>Juice Shop</title>
<a href="/login">login</a>
<link rel="stylesheet" href="/style.css">
<link rel="icon" href="/favicon.ico">
<form action="/submit"></form>
<script src="/app.js"></script>
<script>fetch("/rest/products")</script>
Expand All @@ -33,7 +36,12 @@ def fake_get(url, timeout):
if url == "http://example.com:3000/api":
return Response(headers={"Content-Type": "application/json"}, text="{}", url=url)
if url == "http://example.com:3000/admin":
return Response(status_code=403, headers={"Content-Type": "text/html"}, text="", url=url)
return Response(
status_code=200,
headers={"Content-Type": "text/html"},
text="<title>Juice Shop</title><main></main>",
url=url,
)
return Response(status_code=404, headers={"Content-Type": "text/plain"}, text="", url=url)

monkeypatch.setattr(endpoint_discovery.requests, "get", fake_get)
Expand Down Expand Up @@ -61,14 +69,18 @@ def fake_get(url, timeout):
assert results[0]["base_url"] == "http://example.com:3000"
assert endpoints["/#/jobs"]["source"] == "response-header:X-Recruiting"
assert endpoints["/login"]["source"] == "html:href"
assert endpoints["/style.css"]["source"] == "html:stylesheet"
assert endpoints["/favicon.ico"]["source"] == "html:icon"
assert endpoints["/app.js"]["source"] == "html:script-src"
assert endpoints["/submit"]["source"] == "html:form-action"
assert endpoints["/rest/products"]["source"] == "html-string"
assert endpoints["/api/orders"]["source"] == "javascript"
assert endpoints["/robots.txt"]["status_code"] == 200
assert endpoints["/robots.txt"]["content_type"] == "text/plain"
assert endpoints["/hidden"]["source"] == "robots.txt"
assert endpoints["/api"]["status_code"] == 200
assert endpoints["/admin"]["status_code"] == 403
assert endpoints["/admin"]["status_code"] == 200
assert endpoints["/admin"]["note"] == "Possible SPA fallback route"
assert not any("cdn.example.net" in url for url, timeout in calls)


Expand Down
Loading
Loading