From fd97c127880d75d3baa14eb62876403af945e740 Mon Sep 17 00:00:00 2001 From: mvanhorn Date: Sat, 27 Jun 2026 00:58:45 -0700 Subject: [PATCH] fix(pipeline): stop creating Route nodes from URLs in config files Infra Route extraction harvested any URL-like string literal from any YAML/TF/TOML file, so a repo of only config files produced spurious Route nodes (terraform registry URL, a JWKS discovery URL, an upstream host, and a healthcheck shell command). These inflated the Route set that get_architecture and cross-repo matching rely on. Restrict the loose string-ref harvesting to genuine Infrastructure-as-Code files (Terraform / HCL) and require a bare URL value, so generic config, dependabot, compose and k8s/kustomize manifests no longer emit Routes. Structured topic->endpoint bindings still flow through cbm_pipeline_process_infra_bindings(), so real infra endpoints are kept. Fixes #521 Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01QK73cX8EuqqwQEJUbycu6g Signed-off-by: mvanhorn --- src/pipeline/pipeline.c | 60 +++++++++++++++++++++++++++++--- src/pipeline/pipeline_internal.h | 12 +++++++ tests/test_pipeline.c | 37 ++++++++++++++++++++ 3 files changed, 104 insertions(+), 5 deletions(-) diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index 9d99a925b..b74332e1d 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -485,10 +485,57 @@ static void cbm_pipeline_process_infra_bindings(cbm_gbuf_t *gbuf, const cbm_file } } -static bool is_infra_file(const char *fp) { - return fp != NULL && - (strstr(fp, ".yaml") != NULL || strstr(fp, ".yml") != NULL || - strstr(fp, ".tf") != NULL || strstr(fp, ".hcl") != NULL || strstr(fp, ".toml") != NULL); +/* Basename of a path: the segment after the final '/' or '\\'. 
*/ +static const char *infra_basename(const char *fp) { + const char *base = fp; + for (const char *p = fp; *p != '\0'; p++) { + if (*p == '/' || *p == '\\') { + base = p + 1; + } + } + return base; +} + +/* True when `name` ends with `suffix`. */ +static bool infra_ends_with(const char *name, const char *suffix) { + size_t nl = strlen(name); + size_t sl = strlen(suffix); + return nl >= sl && strcmp(name + nl - sl, suffix) == 0; +} + +/* True for Infrastructure-as-Code files whose URL string literals denote real + * service endpoints: Terraform / HCL module sources, backends and provider + * endpoints. Generic application config (config.yaml), dependency manifests + * (dependabot.yaml), container-orchestration (compose.yaml) and Kubernetes / + * Kustomize manifests are NOT route sources — the URL-like strings they hold + * (package registries, JWKS discovery endpoints, upstream hosts, healthcheck + * shell commands) are not routes the service serves, and harvesting them + * inflated the Route set that get_architecture and cross-repo matching rely on + * (issue #521). Structured topic->endpoint bindings in config files still flow + * through cbm_pipeline_process_infra_bindings(). */ +bool cbm_is_infra_route_source_file(const char *fp) { + if (fp == NULL) { + return false; + } + const char *base = infra_basename(fp); + return infra_ends_with(base, ".tf") || infra_ends_with(base, ".tf.json") || + infra_ends_with(base, ".hcl"); +} + +/* A URL string literal denotes a single network endpoint only when it is a + * bare URL: no embedded whitespace. Rejects healthcheck/command strings such as + * "curl --fail http://localhost:9000/ || exit 1" that merely contain a URL, + * while still accepting query-string URLs (which use '&', ';'). 
*/ +bool cbm_is_bare_endpoint_url(const char *value) { + if (value == NULL || value[0] == '\0') { + return false; + } + for (const char *p = value; *p != '\0'; p++) { + if ((unsigned char)*p <= ' ') { + return false; + } + } + return true; } /* Try to create an infra Route node from one string_ref. */ @@ -496,6 +543,9 @@ static void try_upsert_infra_route(cbm_gbuf_t *gbuf, const CBMStringRef *sr, con if (sr->kind != CBM_STRREF_URL || !sr->value || !strstr(sr->value, "://")) { return; } + if (!cbm_is_bare_endpoint_url(sr->value)) { + return; + } char route_qn[CBM_ROUTE_QN_SIZE]; snprintf(route_qn, sizeof(route_qn), "__route__infra__%s", sr->value); char route_props[CBM_SZ_512]; @@ -511,7 +561,7 @@ static void try_upsert_infra_route(cbm_gbuf_t *gbuf, const CBMStringRef *sr, con static void cbm_pipeline_extract_infra_routes(cbm_gbuf_t *gbuf, const cbm_file_info_t *files, CBMFileResult **result_cache, int file_count) { for (int i = 0; i < file_count; i++) { - if (!result_cache[i] || !is_infra_file(files[i].rel_path)) { + if (!result_cache[i] || !cbm_is_infra_route_source_file(files[i].rel_path)) { continue; } for (int si = 0; si < result_cache[i]->string_refs.count; si++) { diff --git a/src/pipeline/pipeline_internal.h b/src/pipeline/pipeline_internal.h index af1a8de12..51aa45130 100644 --- a/src/pipeline/pipeline_internal.h +++ b/src/pipeline/pipeline_internal.h @@ -283,6 +283,18 @@ bool cbm_is_k8s_manifest(const char *name, const char *content); bool cbm_is_secret_binding(const char *key, const char *value); bool cbm_is_secret_value(const char *value); +/* ── Infra Route extraction gating (pipeline.c, issue #521) ───────── */ + +/* True only for Infrastructure-as-Code files (Terraform / HCL) whose URL string + * literals denote real service endpoints. Generic config, compose, dependabot + * and Kubernetes / Kustomize manifests are excluded so their URL-like strings + * are not harvested into spurious Route nodes. 
*/ +bool cbm_is_infra_route_source_file(const char *fp); + +/* True when `value` is a non-empty bare URL (no whitespace / bytes <= ' '), as + * opposed to a command string that merely embeds a URL. '&' and ';' are allowed. */ +bool cbm_is_bare_endpoint_url(const char *value); + /* Clean JSON array brackets from CMD/ENTRYPOINT values. * E.g. ["./app", "--flag"] → ./app --flag * Writes result to out (up to out_sz). */ diff --git a/tests/test_pipeline.c b/tests/test_pipeline.c index aca6c0d78..aa52cb8b6 100644 --- a/tests/test_pipeline.c +++ b/tests/test_pipeline.c @@ -3422,6 +3422,41 @@ TEST(infra_is_compose_file) { PASS(); } +/* Issue #521: only Terraform / HCL files are infra-Route sources. Generic + * config, dependabot, compose and k8s/kustomize manifests must be excluded so + * their URL-like strings do not become spurious Route nodes. */ +TEST(infra_route_source_file_gate) { + /* Infrastructure-as-Code: URL literals are real endpoints. */ + ASSERT(cbm_is_infra_route_source_file("main.tf")); + ASSERT(cbm_is_infra_route_source_file("infra/backend.tf")); + ASSERT(cbm_is_infra_route_source_file("modules/net/main.tf.json")); + ASSERT(cbm_is_infra_route_source_file("config.hcl")); + /* Config / orchestration / dependency manifests: NOT route sources. */ + ASSERT(!cbm_is_infra_route_source_file(".github/dependabot.yaml")); + ASSERT(!cbm_is_infra_route_source_file("config.yaml")); + ASSERT(!cbm_is_infra_route_source_file("compose.yaml")); + ASSERT(!cbm_is_infra_route_source_file("docker-compose.yml")); + ASSERT(!cbm_is_infra_route_source_file("k8s/deployment.yaml")); + ASSERT(!cbm_is_infra_route_source_file("kustomization.yaml")); + ASSERT(!cbm_is_infra_route_source_file("settings.toml")); + ASSERT(!cbm_is_infra_route_source_file(NULL)); + /* Don't false-match Terraform var files. */ + ASSERT(!cbm_is_infra_route_source_file("prod.tfvars")); + PASS(); +} + +/* Issue #521: a healthcheck/command string that merely embeds a URL is not a + * bare endpoint and must not become a Route node. 
*/ +TEST(infra_bare_endpoint_url_gate) { + ASSERT(cbm_is_bare_endpoint_url("https://app.terraform.io")); + ASSERT(cbm_is_bare_endpoint_url("http://order-service:8080/v2/orders/{id}")); + ASSERT(!cbm_is_bare_endpoint_url("curl --fail http://localhost:9000/ || exit 1")); + ASSERT(!cbm_is_bare_endpoint_url("wget http://localhost:8080/health && true")); + ASSERT(!cbm_is_bare_endpoint_url("")); + ASSERT(!cbm_is_bare_endpoint_url(NULL)); + PASS(); +} + TEST(infra_is_cloudbuild_file) { /* Port of TestIsCloudbuildFile (5 cases) */ ASSERT(cbm_is_cloudbuild_file("cloudbuild.yaml")); @@ -6254,6 +6289,8 @@ SUITE(pipeline) { RUN_TEST(compile_commands_parse_invalid); /* Infrascan helpers */ RUN_TEST(infra_is_compose_file); + RUN_TEST(infra_route_source_file_gate); + RUN_TEST(infra_bare_endpoint_url_gate); RUN_TEST(infra_is_cloudbuild_file); RUN_TEST(infra_is_shell_script); RUN_TEST(infra_is_dockerfile);