Fix --native subinterpreter merge for shared TIDs using shim stack anchors

pablogsal · pablogsal · commit 77a8d895b335 · 2026-03-08T22:49:58.000Z
When multiple subinterpreters execute on the same OS thread, each
PyThread previously received the full native stack for that TID. That
made native/Python merging fail because every thread in the group saw
the same set of eval frames, so n_eval did not match each thread's
entry-frame count.

This change makes native merging deterministic for same-TID
subinterpreter groups.

The approach works as follows:

- Capture a per-thread stack anchor in the native layer:
  - add Thread::StackAnchor() and d_stack_anchor.
  - compute the anchor from the Python frame chain by walking backwards
    to the nearest stack/shim-owned frame (FRAME_OWNED_BY_INTERPRETER /
    FRAME_OWNED_BY_CSTACK on 3.14+, FRAME_OWNED_BY_CSTACK on 3.12/3.13);
    before 3.12, or when no such frame is found, fall back to the
    thread's first frame address.
- Thread construction now forwards this anchor into PyThread as stack_anchor.
- Switch process/core thread assembly from immediate yielding to collect-then-normalize.
- Group Python threads by tid when native mode is enabled.
- For groups with more than one thread:
  - pick a canonical native stack,
  - sort group members by descending stack_anchor (threads without an
    anchor last, original order as the stable tie-breaker),
  - partition eval-frame ownership according to each thread's Python entry-frame count,
  - slice native frames accordingly per thread.
- If counts are inconsistent, keep existing behavior for that group and skip slicing.
diff --git a/src/pystack/_pystack.pyx b/src/pystack/_pystack.pyx
@@ -66,6 +66,7 @@ from .types import NativeFrame
 from .types import PyCodeObject
 from .types import PyFrame
 from .types import PyThread
+from .types import frame_type
 
 LOGGER = logging.getLogger(__file__)
 
@@ -490,6 +491,7 @@ cdef object _construct_threads_from_interpreter_state(
                 python_version,
                 interpreter_id,
                 name=get_thread_name(pid, current_thread.Tid()),
+                stack_anchor=current_thread.StackAnchor(),
             )
         )
         current_thread = (
@@ -498,6 +500,91 @@ cdef object _construct_threads_from_interpreter_state(
 
     return threads
 
+
+def _entry_frame_count(thread: PyThread) -> int:
+    return sum(1 for frame in thread.all_frames if frame.is_entry)
+
+
+def _eval_frame_positions(thread: PyThread):
+    if not thread.python_version:
+        return []
+    return [
+        index
+        for index, native_frame in enumerate(thread.native_frames)
+        if frame_type(native_frame, thread.python_version) == NativeFrame.FrameType.EVAL
+    ]
+
+
+def _slice_native_stacks_for_same_tid_threads(threads) -> None:
+    if len(threads) < 2:
+        return
+
+    canonical = next((thread for thread in threads if thread.native_frames), None)
+    if canonical is None:
+        return
+
+    canonical_frames = list(canonical.native_frames)
+    eval_positions = [
+        index
+        for index, native_frame in enumerate(canonical_frames)
+        if frame_type(native_frame, canonical.python_version) == NativeFrame.FrameType.EVAL
+    ]
+    if not eval_positions:
+        return
+
+    entry_counts = [_entry_frame_count(thread) for thread in threads]
+    if sum(entry_counts) != len(eval_positions):
+        LOGGER.debug(
+            "Skipping same-tid native slicing for tid %s due to mismatched counts: "
+            "entry=%s eval=%s",
+            threads[0].tid,
+            sum(entry_counts),
+            len(eval_positions),
+        )
+        return
+
+    ordered_threads = sorted(
+        enumerate(threads),
+        key=lambda item: (
+            item[1].stack_anchor is None,
+            -(item[1].stack_anchor or 0),
+            item[0],
+        ),
+    )
+
+    cursor = 0
+    for _, thread in ordered_threads:
+        required_eval_frames = _entry_frame_count(thread)
+        if required_eval_frames == 0:
+            thread.native_frames = []
+            continue
+
+        group_start = cursor
+        group_end = cursor + required_eval_frames
+        prev_eval = eval_positions[group_start - 1] if group_start > 0 else -1
+        next_eval = (
+            eval_positions[group_end]
+            if group_end < len(eval_positions)
+            else len(canonical_frames)
+        )
+        thread.native_frames = canonical_frames[prev_eval + 1 : next_eval]
+        cursor = group_end
+
+
+def _normalize_python_threads(threads, native_mode: NativeReportingMode):
+    if native_mode == NativeReportingMode.OFF:
+        return threads
+
+    threads_by_tid = {}
+    for thread in threads:
+        threads_by_tid.setdefault(thread.tid, []).append(thread)
+
+    for group in threads_by_tid.values():
+        if len(group) <= 1:
+            continue
+        _slice_native_stacks_for_same_tid_threads(group)
+    return threads
+
 cdef object _construct_os_thread(
     shared_ptr[AbstractProcessManager] manager, int pid, int tid
 ):
@@ -625,6 +712,7 @@ def _get_process_threads(
         )
 
     all_tids = list(manager.get().Tids())
+    threads = []
     while head:
         add_native_traces = native_mode != NativeReportingMode.OFF
         for thread in _construct_threads_from_interpreter_state(
@@ -637,9 +725,12 @@ def _get_process_threads(
         ):
             if thread.tid in all_tids:
                 all_tids.remove(thread.tid)
-            yield thread
+            threads.append(thread)
         head = InterpreterUtils.getNextInterpreter(manager, head)
 
+    for thread in _normalize_python_threads(threads, native_mode):
+        yield thread
+
     if native_mode == NativeReportingMode.ALL:
         yield from _construct_os_threads(manager, pid, all_tids)
 
@@ -772,6 +863,7 @@ def _get_process_threads_for_core(
         )
 
     all_tids = list(manager.get().Tids())
+    threads = []
 
     while head:
         add_native_traces = native_mode != NativeReportingMode.OFF
@@ -785,8 +877,11 @@ def _get_process_threads_for_core(
         ):
             if thread.tid in all_tids:
                 all_tids.remove(thread.tid)
-            yield thread
+            threads.append(thread)
         head = InterpreterUtils.getNextInterpreter(manager, head)
 
+    for thread in _normalize_python_threads(threads, native_mode):
+        yield thread
+
     if native_mode == NativeReportingMode.ALL:
         yield from _construct_os_threads(manager, pymanager.pid, all_tids)
diff --git a/src/pystack/_pystack/pythread.cpp b/src/pystack/_pystack/pythread.cpp
@@ -2,6 +2,7 @@
 #include <cassert>
 #include <memory>
 
+#include "cpython/frame.h"
 #include "cpython/pthread.h"
 #include "interpreter.h"
 #include "logging.h"
@@ -18,6 +19,7 @@ namespace pystack {
 Thread::Thread(pid_t pid, pid_t tid)
 : d_pid(pid)
 , d_tid(tid)
+, d_stack_anchor(0)
 {
 }
 
@@ -27,6 +29,12 @@ Thread::Tid() const
     return d_tid;
 }
 
+remote_addr_t
+Thread::StackAnchor() const
+{
+    return d_stack_anchor;  // 0 from the Thread constructor unless PyThread's constructor computed an anchor
+}
+
 const std::vector<NativeFrame>&
 Thread::NativeFrames() const
 {
@@ -148,6 +156,7 @@ PyThread::PyThread(const std::shared_ptr<const AbstractProcessManager>& manager,
                    << frame_addr;
         d_first_frame = std::make_unique<FrameObject>(manager, frame_addr, 0);
     }
+    d_stack_anchor = getStackAnchor(manager, frame_addr);
 
     d_addr = addr;
     remote_addr_t candidate_next_addr = ts.getField(&py_thread_v::o_next);
@@ -237,6 +246,44 @@ PyThread::getFrameAddr(
     }
 }
 
+// Compute the address used to order Python threads that share an OS thread.
+//
+// Starting at `frame_addr`, follows the o_back links of the remote frames and
+// returns the address of the first frame whose owner is
+// FRAME_OWNED_BY_INTERPRETER or FRAME_OWNED_BY_CSTACK (3.14+), or
+// FRAME_OWNED_BY_CSTACK (3.12/3.13). Returns 0 when `frame_addr` is null, and
+// `frame_addr` itself on versions before 3.12 or when no such frame is found.
+remote_addr_t
+PyThread::getStackAnchor(
+        const std::shared_ptr<const AbstractProcessManager>& manager,
+        remote_addr_t frame_addr)
+{
+    // Upper bound on visited frames; presumably a guard against corrupt or
+    // very long frame chains.
+    constexpr int MAX_FRAMES_TO_WALK = 4096;
+
+    if (!frame_addr) {
+        return 0;
+    }
+    if (!manager->versionIsAtLeast(3, 12)) {
+        return frame_addr;
+    }
+
+    // The version does not change while walking, so query it only once.
+    const bool uses_3_14_owners = manager->versionIsAtLeast(3, 14);
+
+    remote_addr_t current_addr = frame_addr;
+    for (int i = 0; i < MAX_FRAMES_TO_WALK && current_addr; ++i) {
+        Structure<py_frame_v> current_frame(manager, current_addr);
+        auto owner = current_frame.getField(&py_frame_v::o_owner);
+
+        bool is_anchor_frame;
+        if (uses_3_14_owners) {
+            is_anchor_frame = owner == Python3_14::FRAME_OWNED_BY_INTERPRETER
+                              || owner == Python3_14::FRAME_OWNED_BY_CSTACK;
+        } else {
+            is_anchor_frame = owner == Python3_12::FRAME_OWNED_BY_CSTACK;
+        }
+        if (is_anchor_frame) {
+            return current_addr;
+        }
+
+        remote_addr_t next_addr = current_frame.getField(&py_frame_v::o_back);
+        if (next_addr == current_addr) {
+            // A frame that links to itself would never terminate the walk.
+            break;
+        }
+        current_addr = next_addr;
+    }
+    return frame_addr;
+}
+
 std::shared_ptr<FrameObject>
 PyThread::FirstFrame() const
 {
diff --git a/src/pystack/_pystack/pythread.h b/src/pystack/_pystack/pythread.h
@@ -16,6 +16,7 @@ class Thread
   public:
     Thread(pid_t pid, pid_t tid);
     pid_t Tid() const;
+    remote_addr_t StackAnchor() const;
     const std::vector<NativeFrame>& NativeFrames() const;
 
     // Methods
@@ -25,6 +26,7 @@ class Thread
     // Data members
     pid_t d_pid;
     pid_t d_tid;
+    remote_addr_t d_stack_anchor;
     std::vector<NativeFrame> d_native_frames;
 };
 
@@ -50,6 +52,9 @@ class PyThread : public Thread
     static remote_addr_t getFrameAddr(
             const std::shared_ptr<const AbstractProcessManager>& manager,
             Structure<py_thread_v>& ts);
+    static remote_addr_t getStackAnchor(
+            const std::shared_ptr<const AbstractProcessManager>& manager,
+            remote_addr_t frame_addr);
 
   private:
     // Data members
diff --git a/src/pystack/_pystack/pythread.pxd b/src/pystack/_pystack/pythread.pxd
@@ -11,6 +11,7 @@ cdef extern from "pythread.h" namespace "pystack":
     cdef cppclass NativeThread "pystack::Thread":
         NativeThread(int, int) except+
         int Tid()
+        remote_addr_t StackAnchor()
         vector[NativeFrame]& NativeFrames()
         void populateNativeStackTrace(shared_ptr[AbstractProcessManager] manager) except+
 
@@ -28,6 +29,7 @@ cdef extern from "pythread.h" namespace "pystack::PyThread":
 cdef extern from "pythread.h" namespace "pystack":
     cdef cppclass Thread "pystack::PyThread":
         int Tid()
+        remote_addr_t StackAnchor()
         shared_ptr[FrameObject] FirstFrame()
         shared_ptr[Thread] NextThread()
         vector[NativeFrame]& NativeFrames()
diff --git a/src/pystack/types.py b/src/pystack/types.py
@@ -117,6 +117,7 @@ class PyThread:
     python_version: Optional[Tuple[int, int]]
     interpreter_id: Optional[int] = None
     name: Optional[str] = None
+    stack_anchor: Optional[int] = None
 
     @property
     def frames(self) -> Iterable[PyFrame]:
diff --git a/tests/integration/test_subinterpreters.py b/tests/integration/test_subinterpreters.py
diff --git a/tests/utils.py b/tests/utils.py