@@ -72,12 +72,12 @@ def _nsa_prefill_att(
         import flash_mla
 
         nsa_dict = att_control.nsa_prefill_dict
-        layer_index = nsa_dict["layer_index"]
-        topk_mem_indices = nsa_dict["topk_mem_indices"]
         topk_indices = nsa_dict["topk_indices"]
-        prefill_cache_kv = nsa_dict["prefill_cache_kv"]
         softmax_scale = nsa_dict["softmax_scale"]
         kv_lora_rank = nsa_dict["kv_lora_rank"]
+        layer_index = nsa_dict["layer_index"]
+        topk_mem_indices = nsa_dict["topk_mem_indices"]
+        prefill_cache_kv = nsa_dict["prefill_cache_kv"]
 
         if self.infer_state.prefix_total_token_num > 0:
             kv, topk_indices = self.infer_state.mem_manager.get_prefill_kv_cache_and_remap_indices(
@@ -92,17 +92,12 @@ def _nsa_prefill_att(
         if topk_indices.ndim == 2:
             topk_indices = topk_indices.unsqueeze(1)
 
-        topk_length = torch.sum(topk_indices != -1, dim=-1, dtype=torch.int32)
-        if topk_length.ndim == 2 and topk_length.shape[1] == 1:
-            topk_length = topk_length[:, 0].contiguous()
-
         mla_out, _, _ = flash_mla.flash_mla_sparse_fwd(
-            q=q.contiguous(),
-            kv=kv.contiguous(),
-            indices=topk_indices.contiguous(),
+            q=q,
+            kv=kv,
+            indices=topk_indices,
             sm_scale=softmax_scale,
             d_v=kv_lora_rank,
-            topk_length=topk_length,
         )
         return mla_out
 
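As an aside to the hunk above (not part of the patch): the removed lines derived an explicit per-request `topk_length` from the `-1` padding in `topk_indices` before passing it to the kernel; the updated call drops that argument, presumably because `flash_mla_sparse_fwd` now handles `-1` padding internally. A minimal sketch of what the removed computation did, with hypothetical shapes:

```python
import torch

# Hypothetical example: batch=2, heads=1, topk=4 (shapes are assumptions,
# not taken from the real model).
topk_indices = torch.tensor(
    [[[3, 7, 9, -1]],     # request 0: 3 valid KV slots, -1 is padding
     [[1, -1, -1, -1]]],  # request 1: 1 valid KV slot
    dtype=torch.int32,
)

# Mirrors the removed code: count the non-padding entries per row...
topk_length = torch.sum(topk_indices != -1, dim=-1, dtype=torch.int32)
# ...and squeeze the singleton head dim to get one length per request.
if topk_length.ndim == 2 and topk_length.shape[1] == 1:
    topk_length = topk_length[:, 0].contiguous()

print(topk_length)  # tensor([3, 1], dtype=torch.int32)
```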
@@ -193,4 +188,4 @@ def _nsa_decode_att(
             is_fp8_kvcache=True,
             indices=topk_mem_indices.contiguous(),
         )
-        return o_tensor[:, 0, :, :]
+        return o_tensor[:, 0, :, :]  # [b, 1, h, d] -> [b, h, d]
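The added comment documents the slice in `_nsa_decode_att`: decode processes one query token per request, so the attention output carries a singleton sequence dim that indexing at 0 drops. A toy check with assumed shapes:

```python
import torch

# Assumed shapes for illustration: batch=2, q_len=1, heads=16, head_dim=512.
o_tensor = torch.randn(2, 1, 16, 512)  # [b, 1, h, d]

out = o_tensor[:, 0, :, :]             # drop the singleton query-length dim
assert out.shape == (2, 16, 512)       # [b, h, d]
```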