diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go index 76bed3e8..95e7d4dc 100644 --- a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go @@ -266,13 +266,19 @@ func (r *CStdlibRegistryRemote) fetchHeaderFromHTTP(entry *core.CStdlibHeaderEnt return &h, nil } -// headerURL prefers the manifest-embedded URL when present (lets the registry -// publisher point individual files at a different host or a versioned path) -// and otherwise constructs one from the loader's baseURL + entry.File. +// headerURL constructs the URL for a per-header JSON by joining the +// loader's configured baseURL with the manifest's relative path. +// +// We DO NOT use entry.URL even when present. The reason: manifests are +// generated against a default `--base-url` (the production CDN), so the +// embedded URLs always point at the prod CDN. Honoring entry.URL would +// silently bypass `--stdlib-base-url` overrides — every test, local +// server, or staging deploy would still hit the production CDN. The +// loader's baseURL is the single source of truth. +// +// entry.URL stays in the schema for forward compatibility (mirrors, +// CDN inspection) but the loader ignores it. 
func (r *CStdlibRegistryRemote) headerURL(entry *core.CStdlibHeaderEntry) string { - if entry.URL != "" { - return entry.URL - } return joinURL(r.baseURL, r.platform, "c", "v1", entry.File) } diff --git a/sast-engine/graph/callgraph/registry/clike_http_test.go b/sast-engine/graph/callgraph/registry/clike_http_test.go index 66a37397..62d981a8 100644 --- a/sast-engine/graph/callgraph/registry/clike_http_test.go +++ b/sast-engine/graph/callgraph/registry/clike_http_test.go @@ -229,19 +229,27 @@ func TestCStdlibRegistry_HTTP_ChecksumUnsupportedFormat(t *testing.T) { assert.Contains(t, err.Error(), "unsupported checksum format") } -func TestCStdlibRegistry_HTTP_ManifestEmbeddedURL(t *testing.T) { +// TestCStdlibRegistry_HTTP_ManifestEmbeddedURLIgnored pins the post-bugfix +// contract: even when the manifest's per-entry URL points elsewhere, the +// loader ignores it and constructs the fetch URL from its own baseURL. +// +// The bug this guards: the generator stamps every manifest with the +// production CDN URL by default, so any `--stdlib-base-url` override (or +// local HTTP server) would silently bypass the override if entry.URL won. +func TestCStdlibRegistry_HTTP_ManifestEmbeddedURLIgnored(t *testing.T) { f := newCFixture() srv := serveFixture(t, f) - // Override the entry's URL to something at a different path; the loader - // must follow it instead of constructing one from baseURL. - f.manifestC.Headers[0].URL = srv.URL + "/registries/linux/c/v1/stdio_stdlib.json" + // Embed a URL on a host the test fixture doesn't even know about. + // If the loader followed entry.URL, the fetch would fail (or hang). + // The loader must ignore it and use baseURL + entry.File. 
+ f.manifestC.Headers[0].URL = "https://nowhere.test/this/would/not/work.json" r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) withTempCacheRoot(t, r) require.NoError(t, r.LoadManifest(noopLogger{})) h, err := r.GetHeader("stdio.h") - require.NoError(t, err) + require.NoError(t, err, "loader must use its own baseURL, not entry.URL") require.Contains(t, h.Functions, "printf") } @@ -345,16 +353,18 @@ func TestCppStdlibRegistry_HTTP_ChecksumMismatch(t *testing.T) { assert.Contains(t, err.Error(), "digest mismatch") } -func TestCppStdlibRegistry_HTTP_ManifestEmbeddedURL(t *testing.T) { +// TestCppStdlibRegistry_HTTP_ManifestEmbeddedURLIgnored — C++ counterpart +// to the C-loader test. Same contract: entry.URL is ignored. +func TestCppStdlibRegistry_HTTP_ManifestEmbeddedURLIgnored(t *testing.T) { f := newCFixture() srv := serveFixture(t, f) - f.manifestCpp.Headers[0].URL = srv.URL + "/registries/linux/cpp/v1/vector_stdlib.json" + f.manifestCpp.Headers[0].URL = "https://nowhere.test/would/not/work.json" r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) withTempCacheRootCpp(t, r) require.NoError(t, r.LoadManifest(noopLogger{})) _, err := r.GetClass("vector", "std::vector") - require.NoError(t, err) + require.NoError(t, err, "loader must use its own baseURL, not entry.URL") } // --- helpers -------------------------------------------------------------- diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go index f16e5d71..61abc722 100644 --- a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go @@ -213,13 +213,10 @@ func (r *CppStdlibRegistryRemote) fetchHeaderFromHTTP(entry *core.CStdlibHeaderE return &h, nil } -// headerURL prefers the manifest-embedded URL when present and falls back -// to //cpp/v1/ for manifests that predate the -// per-entry URL field. 
+// headerURL constructs the URL for a per-header JSON by joining the +// loader's configured baseURL with the manifest's relative path. See +// the C loader's headerURL comment for why entry.URL is ignored. func (r *CppStdlibRegistryRemote) headerURL(entry *core.CStdlibHeaderEntry) string { - if entry.URL != "" { - return entry.URL - } return joinURL(r.baseURL, r.platform, "cpp", "v1", entry.File) } diff --git a/sast-engine/tools/internal/clikeextract/normalize.go b/sast-engine/tools/internal/clikeextract/normalize.go index 99e64163..6a9e3443 100644 --- a/sast-engine/tools/internal/clikeextract/normalize.go +++ b/sast-engine/tools/internal/clikeextract/normalize.go @@ -118,7 +118,7 @@ func collapseWhitespace(s string) string { // so we should never see it here — but the keep rule means even if we do, we // don't accidentally drop something that's just an oddly-named keyword. func IsPrivateSymbol(name string) bool { - if len(name) == 0 { + if len(name) < 2 { return false } if !strings.HasPrefix(name, "_") { diff --git a/sast-engine/tools/internal/clikeextract/walker_xplat.go b/sast-engine/tools/internal/clikeextract/walker_xplat.go index 3f0c15e6..55c7cec5 100644 --- a/sast-engine/tools/internal/clikeextract/walker_xplat.go +++ b/sast-engine/tools/internal/clikeextract/walker_xplat.go @@ -6,6 +6,7 @@ import ( "os" "path/filepath" "sort" + "strings" "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" ) @@ -13,19 +14,31 @@ import ( // Windows headers, accessed cross-platform via mingw-w64 on Ubuntu. // // `apt install mingw-w64` on ubuntu-latest places the Win32 + MSVCRT headers -// under /usr/x86_64-w64-mingw32/include and the mingw libstdc++ tree at -// /usr/x86_64-w64-mingw32/include/c++/. Using mingw on Linux beats -// running a Windows GitHub Actions runner for cost and simplicity, and gives -// us a faithful Win32 surface for stdlib resolution. +// under /usr/x86_64-w64-mingw32/include. 
The mingw libstdc++ tree can live +// in two different places depending on the packaging: +// +// - /usr/x86_64-w64-mingw32/include/c++/<ver> (mingw upstream) +// - /usr/lib/gcc/x86_64-w64-mingw32/<ver>-<threading>/include/c++ (Ubuntu) +// +// Ubuntu's apt-packaged g++-mingw-w64 splits the C++ headers under the +// gcc tree instead of the mingw sysroot. We probe both layouts so the +// generator works regardless of distro packaging. // // All paths exposed as package vars (rather than literals) so tests can // override them to exercise both the hit and miss branches without depending // on whether the host actually has mingw installed. var ( - // windowsMingwRoot is the canonical mingw-w64 install root. Subdirectories - // `include` (C) and `include/c++/` (C++) live underneath. + // windowsMingwRoot is the canonical mingw-w64 install root. Subdirectory + // `include` (C) lives underneath; C++ may also live here for upstream + // mingw installs. windowsMingwRoot = "/usr/x86_64-w64-mingw32" + // ubuntuMingwGccRoot is the directory holding `<ver>-<threading>` subdirs + // (e.g. `13-posix`, `13-win32`) on Debian/Ubuntu. The C++ headers live at + // <ubuntuMingwGccRoot>/<ver>-<threading>/include/c++. We probe this only when the + // upstream layout under windowsMingwRoot doesn't carry C++ headers. + ubuntuMingwGccRoot = "/usr/lib/gcc/x86_64-w64-mingw32" + // darwinSDKRoots is the ordered list of macOS SDK include directories the // generator probes. Command Line Tools first because that's the lighter // install used in CI; Xcode.app is a fallback for full developer setups. @@ -61,28 +74,89 @@ func windowsCSource() (HeaderSource, error) { }, nil } -// windowsCppSource finds the mingw libstdc++ header tree. Returns an error -// when no version directory exists — without one, the C++ surface is -// unrecoverable (the directory name encodes the version). +// windowsCppSource finds the mingw libstdc++ header tree.
Probes both the +// upstream mingw layout (headers under the sysroot) and the Debian/Ubuntu +// layout (headers split under /usr/lib/gcc). Returns an error only when +// neither layout yields a versioned directory. // // The walk lists the C++ STL tree only. Win32 C headers are exposed via // windowsCSource; mixing both into one source would conflate languages. func windowsCppSource() (HeaderSource, error) { - root := filepath.Join(windowsMingwRoot, "include", "c++") - dir, version := findVersionedDir(root) + dir, version, layout := findWindowsMingwCppDir() if dir == "" { return HeaderSource{}, fmt.Errorf("windowsCppSource: no mingw libstdc++ headers under %s "+ - "(install with: apt install g++-mingw-w64)", root) + "or %s (install with: apt install g++-mingw-w64)", + filepath.Join(windowsMingwRoot, "include", "c++"), ubuntuMingwGccRoot) } return HeaderSource{ Platform: core.PlatformWindows, Language: core.LanguageCpp, SearchDirs: []string{dir}, HeaderExts: []string{".h", ".hpp", ".hxx", ""}, - SystemTag: "mingw-w64-libstdc++-" + version, + SystemTag: "mingw-w64-libstdc++-" + version + "-" + layout, }, nil } +// findWindowsMingwCppDir locates the mingw libstdc++ header tree. Returns +// (dir, version, layout-tag) where layout-tag identifies which packaging +// shape matched ("upstream" or "ubuntu") so the SystemTag downstream can +// disambiguate. Returns ("","","") when no layout matches. +// +// Layout precedence: +// +// 1. Upstream mingw: <windowsMingwRoot>/include/c++/<ver> (version-keyed subdirs) +// 2. Ubuntu split: /usr/lib/gcc/x86_64-w64-mingw32/<ver>-<threading>/include/c++ +// prefers the posix threading variant (the libstdc++ shipped under win32 +// threading is functionally identical but the posix one is the platform +// default for general apps).
+func findWindowsMingwCppDir() (dir, version, layout string) { + upstream := filepath.Join(windowsMingwRoot, "include", "c++") + if d, v := findVersionedDir(upstream); d != "" { + return d, v, "upstream" + } + if d, v := findUbuntuMingwCppDir(); d != "" { + return d, v, "ubuntu" + } + return "", "", "" +} + +// findUbuntuMingwCppDir resolves the Debian/Ubuntu mingw libstdc++ tree. +// Scans /usr/lib/gcc/x86_64-w64-mingw32/ for `<ver>-<threading>` subdirs, +// prefers `<ver>-posix`, falls back to `<ver>-win32`. Returns ("","") +// when no candidate exists or the candidate doesn't carry an +// include/c++ subtree. +func findUbuntuMingwCppDir() (dir, version string) { + entries, err := os.ReadDir(ubuntuMingwGccRoot) + if err != nil { + return "", "" + } + var posix, win32 []string + for _, e := range entries { + if !e.IsDir() { + continue + } + name := e.Name() + switch { + case strings.HasSuffix(name, "-posix"): + posix = append(posix, name) + case strings.HasSuffix(name, "-win32"): + win32 = append(win32, name) + } + } + for _, candidates := range [][]string{posix, win32} { + if len(candidates) == 0 { + continue + } + sort.Strings(candidates) + newest := candidates[len(candidates)-1] + cppDir := filepath.Join(ubuntuMingwGccRoot, newest, "include", "c++") + if dirExists(cppDir) { + return cppDir, newest + } + } + return "", "" +} + // darwinCSource probes the canonical macOS SDK include locations and uses // the first one that exists. Apple ships Command Line Tools and full Xcode // installs in different subtrees; the loader tries CLT first because that's @@ -165,14 +239,15 @@ func firstExistingDir(candidates []string) string { return "" } -// detectMingwVersion derives a version string from the libstdc++ directory -// name embedded under the mingw root. Returns "unknown" when the tree has -// not been probed yet (windowsCppSource hits this path before the C source -// builder runs).
Lightweight on purpose: parsing `gcc --version` would add -// an exec dependency that complicates testing for marginal accuracy gain. +// detectMingwVersion derives a short version tag for the C HeaderSource's +// SystemTag. Uses the libstdc++ directory under whichever mingw layout the +// host actually has installed, so the tag stays consistent across +// upstream-mingw and Ubuntu packaging. Returns "unknown" when neither +// layout has C++ headers (only the C side is installed). Lightweight on +// purpose: parsing `gcc --version` would add an exec dependency that +// complicates testing for marginal accuracy gain. func detectMingwVersion() string { - root := filepath.Join(windowsMingwRoot, "include", "c++") - _, v := findVersionedDir(root) + _, v, _ := findWindowsMingwCppDir() if v == "" { return "unknown" } diff --git a/sast-engine/tools/internal/clikeextract/walker_xplat_test.go b/sast-engine/tools/internal/clikeextract/walker_xplat_test.go index f952dd1f..3f6e4bc8 100644 --- a/sast-engine/tools/internal/clikeextract/walker_xplat_test.go +++ b/sast-engine/tools/internal/clikeextract/walker_xplat_test.go @@ -15,9 +15,28 @@ import ( // independent of order. func withTempMingwRoot(t *testing.T, root string) { t.Helper() - orig := windowsMingwRoot + origMingw := windowsMingwRoot + origUbuntu := ubuntuMingwGccRoot windowsMingwRoot = root - t.Cleanup(func() { windowsMingwRoot = orig }) + // Point the Ubuntu probe at a fresh empty dir so tests on a host with + // real mingw installed don't see Ubuntu's libstdc++ tree as a + // "phantom" fallback. + ubuntuMingwGccRoot = filepath.Join(t.TempDir(), "ubuntu-mingw-absent") + t.Cleanup(func() { + windowsMingwRoot = origMingw + ubuntuMingwGccRoot = origUbuntu + }) +} + +// withTempUbuntuMingwRoot points the Ubuntu mingw probe at root for the +// duration of t. Used by tests that exercise the Ubuntu layout +// specifically — independent of withTempMingwRoot so callers can mix and +// match upstream / Ubuntu probe state. 
+func withTempUbuntuMingwRoot(t *testing.T, root string) { + t.Helper() + orig := ubuntuMingwGccRoot + ubuntuMingwGccRoot = root + t.Cleanup(func() { ubuntuMingwGccRoot = orig }) } // withTempDarwinRoots replaces both the C and C++ Darwin probe lists for t. @@ -64,7 +83,7 @@ func TestWindowsCSource_VersionUnknownWhenCppTreeMissing(t *testing.T) { assert.Equal(t, "mingw-w64-unknown", src.SystemTag) } -func TestWindowsCppSource_Found(t *testing.T) { +func TestWindowsCppSource_Found_UpstreamLayout(t *testing.T) { root := t.TempDir() require.NoError(t, os.MkdirAll(filepath.Join(root, "include", "c++", "13"), 0o755)) require.NoError(t, os.MkdirAll(filepath.Join(root, "include", "c++", "12"), 0o755)) @@ -76,7 +95,42 @@ func TestWindowsCppSource_Found(t *testing.T) { assert.Equal(t, core.LanguageCpp, src.Language) assert.Equal(t, []string{filepath.Join(root, "include", "c++", "13")}, src.SearchDirs, "freshest version directory must win") - assert.Equal(t, "mingw-w64-libstdc++-13", src.SystemTag) + assert.Equal(t, "mingw-w64-libstdc++-13-upstream", src.SystemTag) +} + +// TestWindowsCppSource_Found_UbuntuLayout pins the Debian/Ubuntu +// packaging shape where libstdc++ lives under +// /usr/lib/gcc/x86_64-w64-mingw32/<ver>-<threading>/include/c++. +// The posix-threading variant must win when both are present. +func TestWindowsCppSource_Found_UbuntuLayout(t *testing.T) { + // Upstream layout absent.
+ withTempMingwRoot(t, filepath.Join(t.TempDir(), "no-upstream")) + + ubuntu := t.TempDir() + for _, ver := range []string{"13-posix", "13-win32"} { + require.NoError(t, os.MkdirAll(filepath.Join(ubuntu, ver, "include", "c++"), 0o755)) + } + withTempUbuntuMingwRoot(t, ubuntu) + + src, err := windowsCppSource() + require.NoError(t, err) + assert.Equal(t, []string{filepath.Join(ubuntu, "13-posix", "include", "c++")}, src.SearchDirs, + "posix threading variant must win over win32") + assert.Equal(t, "mingw-w64-libstdc++-13-posix-ubuntu", src.SystemTag) +} + +// TestWindowsCppSource_Found_UbuntuLayout_Win32Fallback covers the case +// where only the win32 threading variant is installed. +func TestWindowsCppSource_Found_UbuntuLayout_Win32Fallback(t *testing.T) { + withTempMingwRoot(t, filepath.Join(t.TempDir(), "no-upstream")) + + ubuntu := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(ubuntu, "13-win32", "include", "c++"), 0o755)) + withTempUbuntuMingwRoot(t, ubuntu) + + src, err := windowsCppSource() + require.NoError(t, err) + assert.Equal(t, "mingw-w64-libstdc++-13-win32-ubuntu", src.SystemTag) } func TestWindowsCppSource_Missing(t *testing.T) { @@ -187,7 +241,7 @@ func TestDiscoverHeaderSources_DarwinAndWindowsDispatched(t *testing.T) { target, language, wantTag string }{ {core.PlatformWindows, core.LanguageC, "mingw-w64-13"}, - {core.PlatformWindows, core.LanguageCpp, "mingw-w64-libstdc++-13"}, + {core.PlatformWindows, core.LanguageCpp, "mingw-w64-libstdc++-13-upstream"}, {core.PlatformDarwin, core.LanguageC, "darwin-MacOSX.sdk"}, {core.PlatformDarwin, core.LanguageCpp, "libc++-darwin-v1"}, } {