hw-native-sys
diff --git a/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/golden.py b/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/golden.py
Lines changed: 36 additions & 14 deletions
diff --git a/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/kernels/aiv/kernel_producer.cpp b/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/kernels/aiv/kernel_producer.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/kernels/aiv/kernel_producer_async.cpp b/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/kernels/aiv/kernel_producer_async.cpp
Lines changed: 43 additions & 46 deletions
diff --git a/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/kernels/kernel_config.py
Lines changed: 23 additions & 6 deletions
diff --git a/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/kernels/orchestration/async_demo_orchestration.cpp b/examples/a2a3/tensormap_and_ringbuffer/async_completion_demo/kernels/orchestration/async_demo_orchestration.cpp
Lines changed: 50 additions & 22 deletions
@@ -1,18 +1,13 @@
 """
-Golden script for async_completion_demo (dual-mode).
+Golden script for async_completion_demo.
 
-Computation:
-    producer: out[i] = in[i] * 2.0   (with deferred completion)
+Single-card / sim path keeps the original producer-consumer pipeline:
+    producer: out[i] = in[i] * 2.0
     consumer: result[i] = out[i] + 1.0
 
-    So: result[i] = in[i] * 2.0 + 1.0
-    With in = 3.0: result = 7.0
-
-Args layout: [ptr_in, ptr_out, ptr_result, ptr_event_handle_output,
-              size_in, size_out, size_result, size_event_handle_output, SIZE]
-
-event_handle_output: 16 bytes — used by the kernel and scheduler for async
-  completion signaling. Not compared as test output.
+Hardware 2-card path validates `out` and `result`:
+    each rank TGET_ASYNCs the peer rank's `in` into local `out`, then the
+    normal consumer computes `result = out + 1`.
 """
 
 import ctypes
@@ -45,7 +40,34 @@ def generate_inputs(params: dict) -> list:
     ]
 
 
+def generate_distributed_inputs(rank: int, nranks: int, root: int,
+                                comm_ctx=None) -> list:
+    """Build the named host buffers for one rank of the distributed run.
+
+    Returns a list of ``(name, values)`` pairs for "in", "out" and "result",
+    each holding 128 * 128 Python floats. "in" is filled with the repeating
+    pattern ``(i % 251) / 10.0``; "out" and "result" start as zeros.
+
+    ``nranks``, ``root`` and ``comm_ctx`` are accepted but discarded, and
+    ``rank`` is never read, so every rank receives identical data.
+
+    NOTE(review): because "in" does not depend on ``rank``, the peer's "in"
+    equals the local "in", so the expected values cannot tell a real remote
+    read apart from a local copy -- confirm this is intended.
+    """
+    del comm_ctx
+    del nranks
+    del root
+
+    # Element count of every buffer; compute_golden's fallback branch rebuilds
+    # the same (i % 251) / 10.0 pattern from the index alone.
+    size = 128 * 128
+    inp = [float(i % 251) / 10.0 for i in range(size)]
+    out = [0.0] * size
+    result = [0.0] * size
+
+    return [
+        ("in", inp),
+        ("out", out),
+        ("result", result),
+    ]
+
+
 def compute_golden(tensors: dict, params: dict) -> None:
-    inp = torch.as_tensor(tensors["in"])
-    tensors["result"][:] = inp * 2.0 + 1.0
-    tensors["out"][:] = inp * 2.0
+    """Write the expected "out" and "result" values into ``tensors`` in place.
+
+    When ``tensors`` contains "in", the producer-consumer pipeline is modelled:
+    ``out = in * 2.0`` and ``result = in * 2.0 + 1.0``.
+
+    Otherwise the expected values are rebuilt from the element index using the
+    same ``(i % 251) / 10.0`` pattern that generate_distributed_inputs writes
+    into "in": ``out[i]`` is that value and ``result[i]`` is that value + 1.0.
+
+    ``params`` is not read.
+
+    NOTE(review): presumably the distributed harness passes only the output
+    buffers, which is what selects the second branch -- confirm against the
+    caller.
+    """
+    if "in" in tensors:
+        inp = torch.as_tensor(tensors["in"])
+        tensors["result"][:] = inp * 2.0 + 1.0
+        tensors["out"][:] = inp * 2.0
+        return
+
+    # No "in" available: regenerate the input pattern from the index.
+    out = tensors["out"]
+    result = tensors["result"]
+    for i in range(len(out)):
+        value = float(i % 251) / 10.0
+        out[i] = value
+        result[i] = value + 1.0
@@ -85,8 +85,9 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
 #endif
 
     volatile __gm__ PTO2CompletionQueue* cq = pto2_cq_get(cq_addr);
+    pto2_cq_reset(cq);
     pto2_save_expected_completion(PTO2_ENGINE_SDMA, cq,
         PTO2_CQ_COMPLETION_EVENT_FLAG,
         event_flag_addr, 0);
-    pto2_cq_flush();
+    pto2_cq_flush(cq);
 }
@@ -1,92 +1,89 @@
 /**
- * Async Completion Demo - Hardware SDMA Producer Kernel (func_id=2)
+ * Async Completion Demo - Hardware 2P SDMA TGET Producer Kernel (func_id=2)
  *
- * Implements: out[i] = in[i] * 2.0 via TLOAD/TADD/TSTORE, then issues
- * an async SDMA request via pto2_send_request_entry().
+ * Implements:
+ *   1. Read peer rank's input buffer via TGET_ASYNC into local out
+ *   2. Register the async event in the CQ
+ *   3. Return immediately so the runtime completes the task asynchronously
  *
  * This kernel is only compiled for real hardware (a2a3), not for simulation.
  *
  * Kernel args layout (packed by scheduler):
  *   args[0] = &Tensor(in)            — input tensor struct pointer
  *   args[1] = &Tensor(out)           — output tensor struct pointer
- *   args[2] = sdma_context_addr      — SDMA async context
- *   args[3] = cq_addr                — completion queue (appended by submit_deferred)
+ *   args[2] = CommDeviceContext*     — distributed communication context
+ *   args[3] = sdma_context_addr      — SDMA async context
+ *   args[4] = cq_addr                — completion queue (appended by submit_deferred)
  */
 
 #include <cstdint>
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
 #include <pto/pto-inst.hpp>
 #include "pto/comm/pto_comm_inst.hpp"
 #include "pto/npu/comm/async/sdma/sdma_types.hpp"
 #include "pto/common/pto_tile.hpp"
 
+#include "common/comm_context.h"
 #include "tensor.h"
 
 using namespace pto;
 
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
 #include "pto_rq_kernel_api.h"
 
+// Map a local pointer to the address at the same byte offset in a peer's window.
+// The offset is measured from ctx->windowsIn[ctx->rankId] and then added to
+// ctx->windowsIn[peer_rank].
+// NOTE(review): assumes local_ptr lies inside this rank's windowsIn region and
+// that peer_rank is a valid index; neither is checked here.
+template <typename T>
+AICORE inline __gm__ T* CommRemotePtr(__gm__ CommDeviceContext* ctx, __gm__ T* local_ptr,
+                                      int peer_rank) {
+    uint64_t local_base = ctx->windowsIn[ctx->rankId];
+    uint64_t offset = (uint64_t)local_ptr - local_base;
+    return (__gm__ T*)(ctx->windowsIn[peer_rank] + offset);
+}
+
 extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) {
     __gm__ Tensor* in_tensor  = reinterpret_cast<__gm__ Tensor*>(args[0]);
     __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
-    uint64_t sdma_context     = static_cast<uint64_t>(args[2]);
-    uint64_t cq_addr          = static_cast<uint64_t>(args[3]);
+    // args[2] now carries the communication context, which shifts the SDMA
+    // context and CQ address one slot later than in the previous layout.
+    __gm__ CommDeviceContext* comm_ctx =
+        reinterpret_cast<__gm__ CommDeviceContext*>(args[2]);
+    uint64_t sdma_context     = static_cast<uint64_t>(args[3]);
+    uint64_t cq_addr          = static_cast<uint64_t>(args[4]);
 
     __gm__ float* in_data  = reinterpret_cast<__gm__ float*>(in_tensor->buffer.addr) + in_tensor->start_offset;
     __gm__ float* out_data = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset;
+    // The CQ is fetched and reset before any early return below.
+    volatile __gm__ PTO2CompletionQueue* cq = pto2_cq_get(cq_addr);
+    pto2_cq_reset(cq);
 
-    constexpr int kTRows = 128;
-    constexpr int kTCols = 128;
-    constexpr int kTotalElems = kTRows * kTCols;
+    int my_rank = static_cast<int>(comm_ctx->rankId);
+    int nranks = static_cast<int>(comm_ctx->rankNum);
+    // Only the exact two-rank configuration is handled.
+    // NOTE(review): this path returns after pto2_cq_reset() without saving an
+    // expected completion or calling pto2_cq_flush(cq), and leaves `out`
+    // unwritten -- confirm how the scheduler treats the deferred task here.
+    if (nranks != 2) {
+        pipe_barrier(PIPE_ALL);
+        return;
+    }
+    // With exactly two ranks the peer is the other index (0 <-> 1).
+    int peer_rank = 1 - my_rank;
 
-    using DynShapeDim5 = Shape<1, 1, 1, kTRows, kTCols>;
-    using DynStridDim5 = Stride<1, 1, 1, kTCols, 1>;
-    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-    using TileData = Tile<TileType::Vec, float, kTRows, kTCols, BLayout::RowMajor, -1, -1>;
+    constexpr int kTotalElems = 128 * 128;
 
     using FlatShape = Shape<1, 1, 1, 1, kTotalElems>;
     using FlatStride = Stride<kTotalElems, kTotalElems, kTotalElems, kTotalElems, 1>;
     using FlatGlobalData = GlobalTensor<float, FlatShape, FlatStride>;
-
-    TileData inTile(kTRows, kTCols);
-    TileData outTile(kTRows, kTCols);
-    TASSIGN(inTile, 0x0);
-    TASSIGN(outTile, 0x10000);
-
-    GlobalData inGlobal(in_data);
-    GlobalData outGlobal(out_data);
     FlatGlobalData outGlobalFlat(out_data);
-
-    // Compute out = in + in = in * 2.0
-    TLOAD(inTile, inGlobal);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-    TADD(outTile, inTile, inTile);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-
-    TSTORE(outGlobal, outTile);
-    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    // Address of the peer's `in` buffer, found by rebasing the local `in`
+    // pointer onto the peer's window.
+    __gm__ float* remote_in_data = CommRemotePtr(comm_ctx, in_data, peer_rank);
+    FlatGlobalData remoteInGlobalFlat(remote_in_data);
 
     using ScratchTile = pto::Tile<pto::TileType::Vec, uint8_t, 1, pto::comm::sdma::UB_ALIGN_SIZE>;
     ScratchTile scratchTile;
     TASSIGN(scratchTile, 0x20000);
 
     __gm__ uint8_t* context = reinterpret_cast<__gm__ uint8_t*>(static_cast<uintptr_t>(sdma_context));
-    volatile __gm__ PTO2CompletionQueue* cq = pto2_cq_get(cq_addr);
 
-    auto desc = pto2_sdma_descriptor(outGlobalFlat, outGlobalFlat, scratchTile, context);
+    // Per the file header: TGET the peer's `in` into local `out`. The tag
+    // returned by the request is what gets saved as the expected completion.
+    auto desc = pto2_sdma_tget_descriptor(outGlobalFlat, remoteInGlobalFlat, scratchTile, context);
     uint64_t tag = pto2_send_request_entry(PTO2_ENGINE_SDMA, PTO2_RQ_ID_AUTO, desc);
     pto2_save_expected_completion(PTO2_ENGINE_SDMA, cq, tag);
 
-    pto2_cq_flush();
+    pto2_cq_flush(cq);
 }
@@ -1,12 +1,9 @@
 """
 Async Completion Demo - Kernel and Orchestration Configuration
 
-Dual-mode demonstration:
-  Sim mode (a2a3sim):  func_id=0 (simulated producer, direct EVENT_FLAG completion)
-  HW mode  (a2a3):     func_id=2 (TPUT_ASYNC producer, EVENT_HANDLE_SLOT completion)
-
-Both modes share func_id=1 (consumer, run-to-completion).
-Orchestration dynamically selects mode based on SDMA context availability.
+Two hardware cards use the existing deferred-completion producer API to
+demonstrate a real 2P TGET_ASYNC remote read. The legacy single-card / sim
+path stays available for local debugging.
 """
 
 import os
@@ -34,6 +31,26 @@
 RUNTIME_CONFIG = {
     "runtime": "tensormap_and_ringbuffer",
     "aicpu_thread_num": 4,
+    # aicpu_orchestration_entry returns immediately for any orchestration
+    # thread index other than 0.
+    "orch_thread_num": 1,
     "block_dim": 3,
     "rounds": 1,
 }
+
+# Hardware-only settings for the 2-card TGET_ASYNC run. `_platform` is not
+# defined in this hunk; it must come from earlier in the module.
+if _platform == "a2a3":
+    RUNTIME_ENV = {
+        "PTO2_ENABLE_SDMA": "1",
+    }
+
+    DISTRIBUTED_CONFIG = {
+        "nranks": 2,
+        "root": 0,
+        "win_sync_prefix": 256,
+        # Every count is 128 * 128 to match kTotalElems in the producer kernel
+        # and the shape used by the orchestration. "in" is placed in the
+        # window because the kernel rebases its address onto the peer's window.
+        "buffers": [
+            {"name": "in", "dtype": "float32", "count": 128 * 128, "placement": "window"},
+            {"name": "out", "dtype": "float32", "count": 128 * 128, "placement": "window"},
+            {"name": "result", "dtype": "float32", "count": 128 * 128, "placement": "device"},
+        ],
+        "inputs": ["in"],
+        "outputs": ["out", "result"],
+        # Order must match the arg_count == 4 branch of
+        # aicpu_orchestration_entry, which reads args[0..3] as in, out, result
+        # and a CommDeviceContext pointer (presumably what "deviceCtx" names).
+        "args": ["in", "out", "result", "deviceCtx"],
+    }
@@ -1,34 +1,27 @@
 /**
  * Async Completion Demo - Device-side orchestration (CQ model)
  *
- * DAG structure:
- *   t0 (producer): out = in * 2.0  [deferred completion via CQ]
- *   t1 (consumer): result = out + 1.0  [run-to-completion]
- *   Dependency: t0 -> t1 (consumer reads producer's output tensor)
+ * Two execution modes share this file:
+ *
+ * 1. Single-card / sim mode (legacy demo):
+ *    t0 (producer): out = in * 2.0  [deferred completion via CQ]
+ *    t1 (consumer): result = out + 1.0  [run-to-completion]
+ *
+ * 2. Two-card hardware mode:
+ *    both ranks submit one deferred producer task that TGET_ASYNCs the peer
+ *    rank's input buffer into local out, then run the normal consumer on out.
  *
  * CQ model:
  *   Orchestration marks t0 as complete_in_future and passes a CQ address.
  *   The producer kernel decides at runtime what completions it needs and writes
  *   them into the completion queue. The scheduler reads the CQ after the kernel
  *   returns and registers completions dynamically.
- *
- * Dual-mode dispatch:
- *   - Sim mode (no SDMA context): func_id=0
- *     The sim producer kernel writes 1 to a GM flag, then registers an
- *     EVENT_FLAG CQ entry pointing to that flag.
- *   - HW mode (SDMA available): func_id=2
- *     The HW producer kernel issues TPUT_ASYNC, writes the handle to GM,
- *     then registers an EVENT_HANDLE_SLOT CQ entry.
- *
- * Args layout (from golden.py):
- *   [ptr_in, ptr_out, ptr_result, ptr_event_handle_output,
- *    size_in, size_out, size_result, size_event_handle_output, SIZE]
- *   + [gm_heap, heap_size] appended by runtime_maker.cpp
  */
 
 #include <stddef.h>
 #include <stdint.h>
 
+#include "common/comm_context.h"
 #include "pto_orchestration_api.h"
 
 #define ARG_PTR_IN                   0
@@ -48,9 +41,8 @@ extern "C" {
 __attribute__((visibility("default")))
 PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count) {
     (void)args;
-    (void)arg_count;
     return PTO2OrchestrationConfig{
-        .expected_arg_count = 9,
+        // 9 args selects the legacy single-card layout (ARG_* indices); anything
+        // smaller reports the 4-arg two-card layout (in, out, result, comm ctx).
+        // NOTE(review): an arg_count of 5..8 also reports 4 here, while
+        // aicpu_orchestration_entry only takes the two-card path when
+        // arg_count == 4 -- confirm the runtime rejects such a mismatch.
+        .expected_arg_count = (arg_count >= 9) ? 9 : 4,
     };
 }
 
@@ -59,7 +51,43 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
                                int orch_thread_num, int orch_thread_index) {
     (void)arg_count;
     (void)orch_thread_num;
-    (void)orch_thread_index;
+    if (orch_thread_index != 0) return;
+
+    if (arg_count == 4) {
+        void* in_ptr  = (void*)(uintptr_t)args[0];
+        void* out_ptr = (void*)(uintptr_t)args[1];
+        void* result_ptr = (void*)(uintptr_t)args[2];
+        auto* comm_ctx = reinterpret_cast<CommDeviceContext*>((uintptr_t)args[3]);
+        int my_rank = (int)comm_ctx->rankId;
+
+        uint32_t shapes[1] = {128 * 128};
+        Tensor ext_in  = make_tensor_external(in_ptr, shapes, 1, DataType::FLOAT32);
+        Tensor ext_out = make_tensor_external(out_ptr, shapes, 1, DataType::FLOAT32);
+        Tensor ext_result = make_tensor_external(result_ptr, shapes, 1, DataType::FLOAT32);
+
+        uint64_t sdma_context = pto2_rt_get_sdma_context();
+        uint64_t cq = pto2_rt_alloc_cq();
+        if (sdma_context == 0 || cq == 0) {
+            LOG_ERROR("async_demo 2P: rank %d failed to get SDMA context or CQ (sdma=0x%lx, cq=0x%lx)",
+                      my_rank, sdma_context, cq);
+            return;
+        }
+
+        PTOParam params_producer;
+        params_producer.add_input(ext_in);
+        params_producer.add_output(ext_out);
+        params_producer.add_scalar((uint64_t)(uintptr_t)comm_ctx);
+        params_producer.add_scalar(sdma_context);
+        pto2_rt_submit_aiv_task_deferred(2, params_producer, cq);
+
+        PTOParam params_consumer;
+        params_consumer.add_input(ext_out);
+        params_consumer.add_output(ext_result);
+        pto2_rt_submit_aiv_task(1, params_consumer);
+
+        LOG_INFO("async_demo 2P: rank %d submitted TGET_ASYNC producer with CQ", my_rank);
+        return;
+    }
 
     void* in_ptr     = (void*)(uintptr_t)args[ARG_PTR_IN];
     void* out_ptr    = (void*)(uintptr_t)args[ARG_PTR_OUT];
@@ -79,14 +107,14 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
     Tensor ext_result = make_tensor_external(result_ptr, shapes, 1, DataType::FLOAT32);
 
     if (sdma_context != 0) {
-        // HW mode: kernel issues TPUT_ASYNC, puts event.handle directly in CQ entry.
+        // HW mode: kernel issues async SDMA request and puts event.handle directly in CQ entry.
         PTOParam params_producer;
         params_producer.add_input(ext_in);
         params_producer.add_output(ext_out);
         params_producer.add_scalar(sdma_context);
         pto2_rt_submit_aiv_task_deferred(2, params_producer, cq);
 
-        LOG_INFO("async_demo: HW mode - submitted TPUT_ASYNC producer (func_id=2) with CQ");
+        LOG_INFO("async_demo: HW mode - submitted async SDMA producer (func_id=2) with CQ");
     } else {
         PTOParam params_producer;
         params_producer.add_input(ext_in);