
Commit 8c159ad

refactor(pr66-simplify): correct rstd_out semantic name + clarity fixes
Author: zhangyue (committed)
Post-merge /simplify review findings applied:

- **`AddRmsNorm` param rename** (`src/base/add_rms_norm.h` + 3 Ascend kernels + test): `rstd_out` → `residual_out`. The slot actually holds `xOut` (the `input + other` residual sum) per `aclnnAddRmsNorm`'s API; the internal `rstd_tensor_` reciprocal-std buffer is private, so the prior name was misleading.
- **Generator shim for `apply_rotary_pos_emb`** (`scripts/generate_wrappers.py`): rename the `head_size`-as-`rotary_dim` positional forward to a named local `rotary_dim_shim`, with a comment noting that the legacy shim assumes full rotary (`rotary_dim == head_size`).
- **`kernel_sincos_cache.h` leak comment**: TODO → FIXME, with a call-out of the impact on persistent workers. The actual fix remains blocked on the undocumented input-address index layout of `aclnnRopeWithSinCosCache`.

Skipped findings: reviewer false positives on `src/base/rotary_embedding.h` members (all consumed by kernels) and on `max_seq_len_` (used in the constructor body). Larger refactors (`UploadCosSinCache` + `IndexSelect` helpers, ~100 lines of copy-paste) are deferred to a follow-up PR.
1 parent 3fc0e8d commit 8c159ad

7 files changed: 68 additions & 60 deletions

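For reference on the headline rename, a minimal host-side sketch of the operator's semantics (a scalar-loop illustration; the function and variable names are mine, not from the codebase). `residual_out` carries the `input + other` sum that downstream layers consume, while the reciprocal standard deviation never leaves the kernel:

#include <cmath>
#include <cstddef>

// Reference semantics, per row of length `dim`:
//   residual_out = input + other                         (exposed; was `rstd_out`)
//   rstd         = 1 / sqrt(mean(residual_out^2) + eps)  (internal only)
//   out          = residual_out * rstd * weight
void add_rms_norm_ref(const float* input, const float* other,
                      const float* weight, float eps, std::size_t rows,
                      std::size_t dim, float* out, float* residual_out) {
  for (std::size_t r = 0; r < rows; ++r) {
    float sum_sq = 0.0f;
    for (std::size_t d = 0; d < dim; ++d) {
      const float x = input[r * dim + d] + other[r * dim + d];
      residual_out[r * dim + d] = x;  // first output: the residual sum
      sum_sq += x * x;
    }
    const float rstd = 1.0f / std::sqrt(sum_sq / dim + eps);
    for (std::size_t d = 0; d < dim; ++d) {
      out[r * dim + d] = residual_out[r * dim + d] * rstd * weight[d];
    }
  }
}
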
scripts/generate_wrappers.py

Lines changed: 7 additions & 2 deletions
@@ -342,9 +342,14 @@ def _generate_apply_rotary_pos_emb_shim():
     py::object positions = torch.attr("arange")(
         num_tokens, py::arg("dtype") = torch.attr("int64"),
         py::arg("device") = cos.attr("device"));
+    // Legacy `apply_rotary_pos_emb` has no `rotary_dim` param; it assumes
+    // full rotation (`rotary_dim == head_size`) — partial rotary is not
+    // supported through this shim. Callers needing partial rotary must
+    // invoke `rotary_embedding` directly with the correct `rotary_dim`.
+    const int64_t rotary_dim_shim = head_size;
     self_module.attr("rotary_embedding")(
-        positions, query, key, cos_sin_cache, head_size,
-        py::int_(head_size), is_neox_style, query_out, key_out,
+        positions, query, key, cos_sin_cache, head_size, rotary_dim_shim,
+        is_neox_style, query_out, key_out,
         /*pre_gathered=*/true,
         py::arg("implementation_index") = implementation_index,
         py::arg("stream") = stream);

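To make the shim comment concrete, here is a hypothetical caller-side fragment (not generated code; `ops`, the literal sizes, and `implementation_index=0` are assumptions for illustration) showing how partial rotary bypasses the shim and calls `rotary_embedding` directly with `rotary_dim < head_size`:

#include <cstdint>
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Hypothetical direct call for partial rotary; the argument order mirrors
// the generated wrapper above. Only `rotary_dim` differs from the shim path.
void call_partial_rotary(py::module_ ops, py::object positions,
                         py::object query, py::object key,
                         py::object cos_sin_cache, py::object query_out,
                         py::object key_out, py::object stream) {
  const int64_t head_size = 128;  // illustrative
  const int64_t rotary_dim = 64;  // rotate only part of each head
  ops.attr("rotary_embedding")(positions, query, key, cos_sin_cache,
                               head_size, rotary_dim,
                               /*is_neox_style=*/true, query_out, key_out,
                               /*pre_gathered=*/true,
                               py::arg("implementation_index") = 0,
                               py::arg("stream") = stream);
}
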
src/ascend/add_rms_norm/kernel.h

Lines changed: 15 additions & 15 deletions
@@ -24,14 +24,14 @@ template <>
 class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
  public:
   Operator(const Tensor input, const Tensor other, const Tensor weight,
-           float eps, Tensor out, Tensor rstd_out)
-      : AddRmsNorm(input, other, weight, eps, out, rstd_out),
+           float eps, Tensor out, Tensor residual_out)
+      : AddRmsNorm(input, other, weight, eps, out, residual_out),
         input_cache_(input),
         other_cache_(other),
         weight_cache_(weight),
         out_cache_(out),
-        rstd_out_cache_(rstd_out) {
-    // Alpha scalar for `aclnnAdd` (`rstd_out = input + 1.0 * other`).
+        residual_out_cache_(residual_out) {
+    // Alpha scalar for `aclnnAdd` (`residual_out = input + 1.0 * other`).
     alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT);
 
     // `aclnnRmsNorm` writes `rstd` as a required side output. Size is
@@ -49,32 +49,32 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
     other_cache_.release();
     weight_cache_.release();
     out_cache_.release();
-    rstd_out_cache_.release();
+    residual_out_cache_.release();
 
     // `rstd_tensor_` leaks with `norm_exec_` at shutdown (see `64c367c`).
     if (alpha_) aclDestroyScalar(alpha_);
   }
 
   void operator()(const Tensor input, const Tensor other, const Tensor weight,
-                  float eps, Tensor out, Tensor rstd_out) const override {
+                  float eps, Tensor out, Tensor residual_out) const override {
     auto t_input = input_cache_.get(const_cast<void*>(input.data()));
     auto t_other = other_cache_.get(const_cast<void*>(other.data()));
     auto t_weight = weight_cache_.get(const_cast<void*>(weight.data()));
     auto t_out = out_cache_.get(out.data());
-    auto t_rstd_out = rstd_out_cache_.get(rstd_out.data());
+    auto t_residual_out = residual_out_cache_.get(residual_out.data());
     auto stream = static_cast<aclrtStream>(stream_);
 
-    // Step 1: `rstd_out = input + other`.
+    // Step 1: `residual_out = input + other`.
     if (!add_exec_) {
-      aclnnAddGetWorkspaceSize(t_input, t_other, alpha_, t_rstd_out, &add_ws_,
-                               &add_exec_);
+      aclnnAddGetWorkspaceSize(t_input, t_other, alpha_, t_residual_out,
+                               &add_ws_, &add_exec_);
       aclSetAclOpExecutorRepeatable(add_exec_);
     } else {
       aclSetInputTensorAddr(add_exec_, 0, t_input,
                             const_cast<void*>(input.data()));
       aclSetInputTensorAddr(add_exec_, 1, t_other,
                             const_cast<void*>(other.data()));
-      aclSetOutputTensorAddr(add_exec_, 0, t_rstd_out, rstd_out.data());
+      aclSetOutputTensorAddr(add_exec_, 0, t_residual_out, residual_out.data());
     }
     auto& add_arena = ascend::GetWorkspacePool().Ensure(stream, add_ws_);
     aclnnAdd(add_arena.buf, add_ws_, add_exec_, stream);
@@ -92,13 +92,13 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
       aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf);
     }
 
-    // Step 2: `out = rms_norm(rstd_out, weight, eps)`.
+    // Step 2: `out = rms_norm(residual_out, weight, eps)`.
     if (!norm_exec_) {
-      aclnnRmsNormGetWorkspaceSize(t_rstd_out, t_weight, eps, t_out,
+      aclnnRmsNormGetWorkspaceSize(t_residual_out, t_weight, eps, t_out,
                                    rstd_tensor_, &norm_ws_, &norm_exec_);
       aclSetAclOpExecutorRepeatable(norm_exec_);
     } else {
-      aclSetInputTensorAddr(norm_exec_, 0, t_rstd_out, rstd_out.data());
+      aclSetInputTensorAddr(norm_exec_, 0, t_residual_out, residual_out.data());
       aclSetInputTensorAddr(norm_exec_, 1, t_weight,
                             const_cast<void*>(weight.data()));
       aclSetOutputTensorAddr(norm_exec_, 0, t_out, out.data());
@@ -117,7 +117,7 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
 
   mutable ascend::AclTensorCache out_cache_;
 
-  mutable ascend::AclTensorCache rstd_out_cache_;
+  mutable ascend::AclTensorCache residual_out_cache_;
 
   float alpha_storage_ = 1.0f;

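Both steps above pull scratch memory from `ascend::GetWorkspacePool().Ensure(stream, ws)`. The pool itself is outside this diff; the following is a hedged, grow-only reconstruction of what `Ensure` plausibly does (the `Arena` and `WorkspacePool` shapes are assumptions, only the call site comes from the diff):

#include <cstdint>
#include <unordered_map>
#include <acl/acl.h>  // aclrtMalloc, aclrtFree, aclrtStream

// Assumed per-stream, grow-only workspace arena behind
// `GetWorkspacePool().Ensure(stream, ws)`. A real pool would also need to
// avoid freeing a buffer still referenced by in-flight launches.
struct Arena {
  void* buf = nullptr;
  uint64_t size = 0;
};

class WorkspacePool {
 public:
  // Return an arena for `stream` with at least `size` bytes; grow but never
  // shrink, so repeated launches reuse one allocation.
  Arena& Ensure(aclrtStream stream, uint64_t size) {
    Arena& a = arenas_[stream];
    if (size > a.size) {
      if (a.buf) aclrtFree(a.buf);
      aclrtMalloc(&a.buf, size, ACL_MEM_MALLOC_HUGE_FIRST);
      a.size = size;
    }
    return a;
  }

 private:
  std::unordered_map<aclrtStream, Arena> arenas_;
};
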
src/ascend/add_rms_norm/kernel_custom.h

Lines changed: 7 additions & 7 deletions
@@ -29,14 +29,14 @@ namespace infini::ops {
 
 // Custom AscendC fused `AddRmsNorm` kernel (implementation index 2).
 //
-// A single-kernel implementation that computes `rstd_out = input + other`
-// followed by `out = rms_norm(rstd_out, weight, eps)` in one launch,
+// A single-kernel implementation that computes `residual_out = input + other`
+// followed by `out = rms_norm(residual_out, weight, eps)` in one launch,
 // avoiding the decomposed `aclnnAdd` + `aclnnRmsNorm` calls (index 0) or
 // the fused `aclnnAddRmsNorm` call (index 1). Migrated from the custom
 // `RmsNorm` kernel (index 1 of `RmsNorm`).
 //
 // Select via `implementation_index=2` in Python:
-//   `infini.ops.add_rms_norm(input, other, weight, eps, out, rstd_out,
+//   `infini.ops.add_rms_norm(input, other, weight, eps, out, residual_out,
 //    implementation_index=2, stream=s)`.
 //
 // Requirements:
@@ -49,8 +49,8 @@ template <>
 class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
  public:
   Operator(const Tensor input, const Tensor other, const Tensor weight,
-           float eps, Tensor out, Tensor rstd_out)
-      : AddRmsNorm(input, other, weight, eps, out, rstd_out) {
+           float eps, Tensor out, Tensor residual_out)
+      : AddRmsNorm(input, other, weight, eps, out, residual_out) {
     // Dtype size in bytes.
     dtype_size_ = (input.dtype() == DataType::kFloat16) ? 2 : 4;
 
@@ -96,7 +96,7 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
   }
 
   void operator()(const Tensor input, const Tensor other, const Tensor weight,
-                  float eps, Tensor out, Tensor rstd_out) const override {
+                  float eps, Tensor out, Tensor residual_out) const override {
     auto stream = static_cast<aclrtStream>(stream_);
 
     // Determine `float32` `weight` pointer.
@@ -144,7 +144,7 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
     // Launch custom AscendC kernel.
     aclrtlaunch_add_rms_norm(block_dim, stream, const_cast<void*>(input.data()),
                              const_cast<void*>(other.data()), weight_fp32,
-                             out.data(), rstd_out.data(), total_rows_,
+                             out.data(), residual_out.data(), total_rows_,
                              static_cast<int64_t>(dim_), dim_length_align_,
                              former_num, former_length, tail_length, eps,
                              dtype_size_);

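The launch parameters `former_num`, `former_length`, and `tail_length` suggest a row split across the `block_dim` AI cores. The tiling computation is not part of this diff; the following is a hedged guess at the split (the interpretation of the three names is an assumption):

#include <cstdint>

// Assumed row tiling: the first `former_num` cores take one extra row each
// (`former_length` rows), the remaining cores take `tail_length` rows, so
// former_num * former_length + (block_dim - former_num) * tail_length
// covers all of total_rows.
struct RowTiling {
  int64_t former_num;
  int64_t former_length;
  int64_t tail_length;
};

RowTiling SplitRows(int64_t total_rows, int64_t block_dim) {
  RowTiling t;
  t.tail_length = total_rows / block_dim;
  t.former_num = total_rows % block_dim;
  t.former_length = t.tail_length + (t.former_num > 0 ? 1 : 0);
  return t;
}
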
src/ascend/add_rms_norm/kernel_fused.h

Lines changed: 14 additions & 14 deletions
@@ -15,25 +15,25 @@ namespace infini::ops {
 
 // Fused implementation via `aclnnAddRmsNorm` (implementation index 1).
 //
-// Computes `rstd_out = input + other` and `out = rms_norm(rstd_out, weight,
-// eps)` in a single CANN launch. The fused API has higher host-side launch
-// overhead (~200 us) compared to the decomposed `aclnnAdd` + `aclnnRmsNorm`
-// path (~39 us), but may offer better NPU-side efficiency for large tensors
-// where kernel fusion reduces memory traffic.
+// Computes `residual_out = input + other` and `out = rms_norm(residual_out,
+// weight, eps)` in a single CANN launch. The fused API has higher host-side
+// launch overhead (~200 us) compared to the decomposed `aclnnAdd` +
+// `aclnnRmsNorm` path (~39 us), but may offer better NPU-side efficiency for
+// large tensors where kernel fusion reduces memory traffic.
 //
 // Select via `implementation_index=1` in Python:
 //   infini.ops.add_rms_norm(..., implementation_index=1, stream=s)
 template <>
 class Operator<AddRmsNorm, Device::Type::kAscend, 1> : public AddRmsNorm {
  public:
   Operator(const Tensor input, const Tensor other, const Tensor weight,
-           float eps, Tensor out, Tensor rstd_out)
-      : AddRmsNorm(input, other, weight, eps, out, rstd_out),
+           float eps, Tensor out, Tensor residual_out)
+      : AddRmsNorm(input, other, weight, eps, out, residual_out),
         input_cache_(input),
         other_cache_(other),
         weight_cache_(weight),
         out_cache_(out),
-        rstd_out_cache_(rstd_out) {
+        residual_out_cache_(residual_out) {
     // `aclnnAddRmsNorm` requires `rstdOut` to have the same ndim as `input`,
     // with the last `weight.ndim()` dimensions set to 1. For example:
     // `input` (2, 32, 128), `weight` (128) -> `rstdOut` (2, 32, 1).
@@ -68,25 +68,25 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 1> : public AddRmsNorm {
     other_cache_.release();
     weight_cache_.release();
     out_cache_.release();
-    rstd_out_cache_.release();
+    residual_out_cache_.release();
 
     // `rstd_tensor_` leaks with the executor at shutdown (see `64c367c`).
     if (rstd_data_) aclrtFree(rstd_data_);
   }
 
   void operator()(const Tensor input, const Tensor other, const Tensor weight,
-                  float eps, Tensor out, Tensor rstd_out) const override {
+                  float eps, Tensor out, Tensor residual_out) const override {
     auto t_input = input_cache_.get(const_cast<void*>(input.data()));
     auto t_other = other_cache_.get(const_cast<void*>(other.data()));
    auto t_weight = weight_cache_.get(const_cast<void*>(weight.data()));
     auto t_out = out_cache_.get(out.data());
-    auto t_rstd_out = rstd_out_cache_.get(rstd_out.data());
+    auto t_residual_out = residual_out_cache_.get(residual_out.data());
     auto stream = static_cast<aclrtStream>(stream_);
 
     if (!executor_) {
       aclnnAddRmsNormGetWorkspaceSize(
           t_input, t_other, t_weight, static_cast<double>(eps), t_out,
-          rstd_tensor_, t_rstd_out, &ws_size_, &executor_);
+          rstd_tensor_, t_residual_out, &ws_size_, &executor_);
       aclSetAclOpExecutorRepeatable(executor_);
     } else {
       aclSetInputTensorAddr(executor_, 0, t_input,
@@ -97,7 +97,7 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 1> : public AddRmsNorm {
                             const_cast<void*>(weight.data()));
       aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
       // `rstd` at output index 1 has a stable address — no update needed.
-      aclSetOutputTensorAddr(executor_, 2, t_rstd_out, rstd_out.data());
+      aclSetOutputTensorAddr(executor_, 2, t_residual_out, residual_out.data());
     }
 
     auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
@@ -113,7 +113,7 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 1> : public AddRmsNorm {
 
   mutable ascend::AclTensorCache out_cache_;
 
-  mutable ascend::AclTensorCache rstd_out_cache_;
+  mutable ascend::AclTensorCache residual_out_cache_;
 
   std::vector<int64_t> fused_rstd_shape_;

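A small self-contained sketch of the `rstdOut` shape rule quoted in the constructor comment above (the helper name is mine, not from the codebase): keep `input`'s ndim and collapse the trailing `weight.ndim()` dimensions to 1.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Shape of `aclnnAddRmsNorm`'s required `rstdOut` side output, per the rule
// quoted above: same ndim as `input`, last `weight_ndim` dims set to 1.
std::vector<int64_t> FusedRstdShape(const std::vector<int64_t>& input_shape,
                                    std::size_t weight_ndim) {
  assert(weight_ndim <= input_shape.size());
  std::vector<int64_t> shape = input_shape;
  for (std::size_t i = shape.size() - weight_ndim; i < shape.size(); ++i) {
    shape[i] = 1;
  }
  return shape;
}

// Example: FusedRstdShape({2, 32, 128}, 1) -> {2, 32, 1}.
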
src/ascend/rotary_embedding/kernel_sincos_cache.h

Lines changed: 11 additions & 9 deletions
@@ -124,17 +124,19 @@ class Operator<RotaryEmbedding, Device::Type::kAscend, 2>
     auto t_q_out = q_out_cache_.get(const_cast<void*>(q_out.data()));
     auto t_k_out = k_out_cache_.get(const_cast<void*>(k_out.data()));
 
-    // Fresh executor each call: `aclnnRopeWithSinCosCache`'s public header
-    // hides four `REG_OP` attrs (see
-    // `aclnn_rope_with_sin_cos_cache_hidden_attrs` memory). The official
+    // FIXME: per-call unbounded executor leak. `aclnnRopeWithSinCosCache`'s
+    // public header hides four `REG_OP` attrs (see
+    // `aclnn_rope_with_sin_cos_cache_hidden_attrs` memory), so the official
     // `aclSetInputTensorAddr` index numbering for this kernel is not
-    // documented, so we cannot safely reuse a Repeatable executor across
-    // calls. The async stream consumes the executor after enqueue, so
-    // destroying it synchronously here would race with the launch — we
-    // leak for now.
+    // documented — we cannot safely reuse a Repeatable executor across calls.
+    // The async stream consumes the executor after enqueue, so destroying it
+    // synchronously here races with the launch (SIGABRT). Long-running
+    // persistent workers (e.g. vLLM decode) accumulate one executor per
+    // forward step until the runtime tears down.
     //
-    // TODO: cache + set Repeatable once the input-address index layout is
-    // confirmed for this kernel.
+    // Resolve by obtaining the input-address index layout from the CANN team
+    // (or deriving it from the binary) and switching to the cached-executor
+    // pattern used in `kernel.h` / `kernel_atb.h`.
     uint64_t ws_size = 0;
     aclOpExecutor* executor = nullptr;

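For contrast, a condensed sketch of the cached-executor pattern the FIXME points at. The pattern is lifted from the `kernel.h` diff above; `MyOpGetWorkspaceSize` and `MyOp` are hypothetical stand-ins for an aclnn op whose input-address index layout is documented, which is exactly what `aclnnRopeWithSinCosCache` lacks:

#include <cstdint>
#include <acl/acl.h>
#include <aclnn/acl_meta.h>  // aclTensor, aclOpExecutor, aclnnStatus

// Hypothetical two-phase aclnn op with a documented index layout; stand-ins
// only, these declarations do not exist in CANN.
extern "C" aclnnStatus MyOpGetWorkspaceSize(aclTensor* in, aclTensor* out,
                                            uint64_t* ws,
                                            aclOpExecutor** exec);
extern "C" aclnnStatus MyOp(void* ws_buf, uint64_t ws, aclOpExecutor* exec,
                            aclrtStream stream);

void LaunchCached(aclTensor* t_in, void* in_addr, aclTensor* t_out,
                  void* out_addr, aclrtStream stream) {
  static aclOpExecutor* exec = nullptr;  // cached across calls
  static uint64_t ws = 0;
  if (!exec) {
    // First call: build the executor once and mark it reusable.
    MyOpGetWorkspaceSize(t_in, t_out, &ws, &exec);
    aclSetAclOpExecutorRepeatable(exec);
  } else {
    // Later calls: patch only the device addresses. This is safe only
    // because the op's input/output indices (0 and 0 here) are documented.
    aclSetInputTensorAddr(exec, 0, t_in, in_addr);
    aclSetOutputTensorAddr(exec, 0, t_out, out_addr);
  }
  auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws);
  MyOp(arena.buf, ws, exec, stream);
}

Until the index layout for `aclnnRopeWithSinCosCache` is confirmed, applying this pattern there risks patching the wrong operand slot, which would corrupt launches rather than merely leak.
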
src/base/add_rms_norm.h

Lines changed: 5 additions & 4 deletions
@@ -11,7 +11,7 @@ namespace infini::ops {
 class AddRmsNorm : public Operator<AddRmsNorm> {
  public:
   AddRmsNorm(const Tensor input, const Tensor other, const Tensor weight,
-             float eps, Tensor out, Tensor rstd_out)
+             float eps, Tensor out, Tensor residual_out)
       : input_shape_{input.shape()},
         eps_{eps},
         dim_{input.size(-1)},
@@ -22,13 +22,14 @@ class AddRmsNorm : public Operator<AddRmsNorm> {
            "`AddRmsNorm`: `input` and `other` must have the same dtype.");
     assert(input.dtype() == out.dtype() &&
            "`AddRmsNorm`: `input` and `out` must have the same dtype.");
-    assert(input.dtype() == rstd_out.dtype() &&
-           "`AddRmsNorm`: `input` and `rstd_out` must have the same dtype.");
+    assert(
+        input.dtype() == residual_out.dtype() &&
+        "`AddRmsNorm`: `input` and `residual_out` must have the same dtype.");
   }
 
   virtual void operator()(const Tensor input, const Tensor other,
                           const Tensor weight, float eps, Tensor out,
-                          Tensor rstd_out) const = 0;
+                          Tensor residual_out) const = 0;
 
  protected:
   Tensor::Shape input_shape_;

tests/test_add_rms_norm.py

Lines changed: 9 additions & 9 deletions
@@ -47,43 +47,43 @@ def test_add_rms_norm(
     other = randn_strided(shape, strides, dtype=dtype, device=device)
     weight = randn_strided(weight_shape, None, dtype=dtype, device=device)
     out = empty_strided(shape, strides, dtype=dtype, device=device)
-    rstd_out = empty_strided(shape, strides, dtype=dtype, device=device)
+    residual_out = empty_strided(shape, strides, dtype=dtype, device=device)
 
     return Payload(
         lambda *args, **kwargs: _add_rms_norm(
             *args, **kwargs, implementation_index=implementation_index
         ),
         _torch_add_rms_norm,
         (input, other, weight),
-        {"eps": eps, "out": out, "rstd_out": rstd_out},
+        {"eps": eps, "out": out, "residual_out": residual_out},
         rtol=rtol,
         atol=atol,
     )
 
 
 def _add_rms_norm(
-    input, other, weight, *, eps=1e-6, out=None, rstd_out=None, implementation_index=0
+    input, other, weight, *, eps=1e-6, out=None, residual_out=None, implementation_index=0
 ):
     infini.ops.add_rms_norm(
         input,
         other,
         weight,
         eps,
         out,
-        rstd_out,
+        residual_out,
         implementation_index=implementation_index,
         stream=get_stream(input.device),
     )
 
     # Concatenate both outputs into a single flat tensor for `allclose` comparison.
-    return torch.cat([out.contiguous().flatten(), rstd_out.contiguous().flatten()])
+    return torch.cat([out.contiguous().flatten(), residual_out.contiguous().flatten()])
 
 
-def _torch_add_rms_norm(input, other, weight, *, eps=1e-6, out=None, rstd_out=None):
+def _torch_add_rms_norm(input, other, weight, *, eps=1e-6, out=None, residual_out=None):
     x_sum = input + other
 
-    if rstd_out is not None:
-        rstd_out.copy_(x_sum)
+    if residual_out is not None:
+        residual_out.copy_(x_sum)
 
     rms = torch.sqrt(
         torch.mean(x_sum.float() * x_sum.float(), dim=-1, keepdim=True) + eps
@@ -93,4 +93,4 @@ def _torch_add_rms_norm(input, other, weight, *, eps=1e-6, out=None, rstd_out=No
     if out is not None:
         out.copy_(y)
 
-    return torch.cat([out.contiguous().flatten(), rstd_out.contiguous().flatten()])
+    return torch.cat([out.contiguous().flatten(), residual_out.contiguous().flatten()])
