From 3fce86ac9f014edd5ade797b88d6ae6497292705 Mon Sep 17 00:00:00 2001
From: bingoo <1575938147@qq.com>
Date: Tue, 19 May 2026 19:19:00 +0800
Subject: [PATCH] =?UTF-8?q?[paddle-adapt]=20nvfp4=5Frenorm:=20fix=20CUDAGr?=
 =?UTF-8?q?aphMoE=20for=20Paddle=20compat\n\n-=20=C2=A753:=20torch.cuda.Ex?=
 =?UTF-8?q?ternalStream=20unavailable=20in=20Paddle=20compat.\n=20=20Use?=
 =?UTF-8?q?=20torch.cuda.Stream()=20+=20stream=5Fbase.cuda=5Fstream=20+=20?=
 =?UTF-8?q?runtime.cudaStream=5Ft(raw=5Fptr).\n=20=20Skip=20cudaStreamDest?=
 =?UTF-8?q?roy=20for=20Paddle-managed=20streams=20in=20cleanup().\n-=20?=
 =?UTF-8?q?=C2=A754:=20torch.empty()=20in=20Paddle=20compat=20calls=20cuda?=
 =?UTF-8?q?MemAlloc,=20forbidden=20during\n=20=20CUDA=20stream=20capture?=
 =?UTF-8?q?=20(error=20900=20cudaErrorStreamCaptureUnsupported).\n=20=20Fi?=
 =?UTF-8?q?x:=20pre-allocate=20quantized=20input=20buffers=20before=20capt?=
 =?UTF-8?q?ure=20via=20quantize=5Finputs();\n=20=20=5Frun=5Fmoe=5Fcomputat?=
 =?UTF-8?q?ion()=20reuses=20static=20buffers=20(no=20alloc=20in=20capture?=
 =?UTF-8?q?=20window);\n=20=20launch()=20re-quantizes=20outside=20graph=20?=
 =?UTF-8?q?and=20updates=20buffers=20via=20copy=5F().\n-=20Regression:=20p?=
 =?UTF-8?q?addle=5Fall=5Ftest=5Fcases.sh=20all=20PASS=20(0=20failed,=20ski?=
 =?UTF-8?q?ps=20are=20hw=20limits).\n-=20New=20test=20added=20to=20paddle?=
 =?UTF-8?q?=5Fall=5Ftest=5Fcases.sh.\n\nRefs:=20MISMATCH=5FEXPERIMENT=20?=
 =?UTF-8?q?=C2=A753,=20=C2=A754?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/paddle_all_test_cases.sh       |  3 +
 tests/moe/test_trtllm_gen_fused_moe.py | 80 +++++++++++++++++++++-----
 2 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/scripts/paddle_all_test_cases.sh b/scripts/paddle_all_test_cases.sh
index 0a28e90af3..f4fcbb4163 100755
--- a/scripts/paddle_all_test_cases.sh
+++ b/scripts/paddle_all_test_cases.sh
@@ -84,3 +84,6 @@ python3.12 -m pytest tests/model_optimizations/ --tb=no -q
 # Fix: conftest.py §44-§48 + §52 monkey-patches (Paddle compat assert_close wraps ALL errors with
 #      "resulted in the unexpected exception above"; bfloat16/float16 isclose kernel missing)
 python3 -m pytest tests/comm/test_dcp_alltoall.py --tb=no -q
+
+# PASS (2026-05-19) §53+§54: CUDAGraphMoE ExternalStream + CUDA graph capture fix
+python3.12 -m pytest -rs "tests/moe/test_trtllm_gen_fused_moe.py::test_renormalize_routing[BF16_logits-Swiglu-Shuffled_MajorK-Renorm-NvFP4xNvFP4-384-1024-8-RandomHiddenStates]"
diff --git a/tests/moe/test_trtllm_gen_fused_moe.py b/tests/moe/test_trtllm_gen_fused_moe.py
index 4d43103d82..02dfabe4c4 100644
--- a/tests/moe/test_trtllm_gen_fused_moe.py
+++ b/tests/moe/test_trtllm_gen_fused_moe.py
@@ -94,6 +94,9 @@ def __init__(self, moe_impl, static_data, **config):
         self.input_tensor = None
         self.output_tensor = None
         self.is_captured = False
+        self._static_quantized = (
+            None  # §54: pre-allocated quantized input buffer for CUDA graph capture
+        )
 
     def capture(self, hidden_states_sample, **runtime_args):
         """Capture CUDA graph with the given sample input."""
@@ -107,12 +110,18 @@ def capture(self, hidden_states_sample, **runtime_args):
             )
 
         # Create stream
-        err, self.stream = runtime.cudaStreamCreate()
-        check_cuda(err)
-
-        # Get the raw stream pointer for PyTorch
-        stream_ptr = int(self.stream)
-        torch_stream = torch.cuda.ExternalStream(stream_ptr)
+        if hasattr(torch.cuda, "ExternalStream"):
+            err, self.stream = runtime.cudaStreamCreate()
+            check_cuda(err)
+            stream_ptr = int(self.stream)
+            torch_stream = torch.cuda.ExternalStream(stream_ptr)
+        else:
+            # §53: Paddle compat - torch.cuda.ExternalStream not available.
+            # Use torch.cuda.Stream() (Paddle-managed) + wrap raw ptr as cudaStream_t.
+            _ts = torch.cuda.Stream()
+            self._torch_stream_ref = _ts  # prevent GC
+            torch_stream = _ts
+            self.stream = runtime.cudaStream_t(_ts.stream_base.cuda_stream)
 
         # Store input tensor reference (will be updated in place during launch)
         self.input_tensor = hidden_states_sample.clone()
@@ -126,6 +135,22 @@ def capture(self, hidden_states_sample, **runtime_args):
         err = runtime.cudaStreamSynchronize(self.stream)[0]
         check_cuda(err)
 
+        # §54: Pre-allocate static quantized input buffers before CUDA graph capture.
+        # In Paddle compat, torch.empty() triggers cudaMemAlloc which is not allowed
+        # during stream capture (cudaErrorStreamCaptureUnsupported / error 900).
+        # We run quantize_inputs once here (outside capture) and store the result tensors
+        # as static buffers; during capture, _run_moe_computation reuses them via copy_().
+        if not hasattr(torch.cuda, "ExternalStream"):
+            _q = self.moe_impl.quantize_inputs(
+                self.input_tensor,
+                self.config["hidden_states_scale_global"],
+                is_swizzling=False,
+            )
+            self._static_quantized = {
+                k: v.clone() if isinstance(v, torch.Tensor) else v
+                for k, v in _q.items()
+            }
+
         # Begin capture
         err, self.graph = runtime.cudaGraphCreate(0)
         check_cuda(err)
@@ -155,6 +180,23 @@ def launch(self, hidden_states_new):
         # Update input tensor in place
         self.input_tensor.copy_(hidden_states_new)
 
+        # §54: Paddle compat - re-quantize inputs before graph replay.
+        # quantize_inputs is outside the captured graph (see _run_moe_computation),
+        # so we must update self._static_quantized with the new input here.
+        if self._static_quantized is not None:
+            _q = self.moe_impl.quantize_inputs(
+                self.input_tensor,
+                self.config["hidden_states_scale_global"],
+                is_swizzling=False,
+            )
+            for k, v in _q.items():
+                if isinstance(v, torch.Tensor) and isinstance(
+                    self._static_quantized.get(k), torch.Tensor
+                ):
+                    self._static_quantized[k].copy_(v)
+                else:
+                    self._static_quantized[k] = v
+
         # Launch graph
         err = runtime.cudaGraphLaunch(self.graph_exec, self.stream)[0]
         check_cuda(err)
@@ -175,20 +217,32 @@ def cleanup(self):
             check_cuda(err)
             self.graph = None
         if self.stream is not None:
-            err = runtime.cudaStreamDestroy(self.stream)[0]
-            check_cuda(err)
+            if not hasattr(self, "_torch_stream_ref"):
+                # Only destroy cudart-created streams, not Paddle-managed ones
+                err = runtime.cudaStreamDestroy(self.stream)[0]
+                check_cuda(err)
             self.stream = None
+            self._torch_stream_ref = None  # release Paddle stream ref
         self.input_tensor = None
         self.output_tensor = None
         self.is_captured = False
 
     def _run_moe_computation(self, runtime_args):
         """Run the MoE computation."""
-        input_quantized = self.moe_impl.quantize_inputs(
-            self.input_tensor,
-            self.config["hidden_states_scale_global"],
-            is_swizzling=False,
-        )
+        if self._static_quantized is not None:
+            # §54: Paddle compat CUDA graph capture fix.
+            # torch.empty() in Paddle compat calls cudaMemAlloc, which is forbidden
+            # during stream capture (cudaErrorStreamCaptureUnsupported / error 900).
+            # Solution: quantize_inputs runs OUTSIDE the capture window (pre-populated
+            # into self._static_quantized), and _run_moe_computation during capture
+            # uses those static buffers directly - no torch.empty() inside capture.
+            input_quantized = self._static_quantized
+        else:
+            input_quantized = self.moe_impl.quantize_inputs(
+                self.input_tensor,
+                self.config["hidden_states_scale_global"],
+                is_swizzling=False,
+            )
 
         output = trtllm_fp4_block_scale_moe(
             routing_logits=runtime_args["expert_logits"],