
Commit e662607

Author: 钮圣虓
Commit message: refine5
1 parent bdf82d2 commit e662607

2 files changed: 76 additions & 278 deletions

File tree

lightllm/common/basemodel/attention/nsa/fp8_flashmla_sparse.py

Lines changed: 15 additions & 1 deletion
@@ -164,20 +164,34 @@ def _nsa_decode_att(
         import flash_mla

         nsa_dict = att_control.nsa_decode_dict
+        layer_index = nsa_dict["layer_index"]
         topk_mem_indices = nsa_dict["topk_mem_indices"]
         softmax_scale = nsa_dict["softmax_scale"]
         kv_lora_rank = nsa_dict["kv_lora_rank"]

+        mem_manager = self.infer_state.mem_manager
+        if hasattr(mem_manager, "get_decode_kv_cache_and_remap_indices"):
+            kv, topk_mem_indices = mem_manager.get_decode_kv_cache_and_remap_indices(
+                layer_index=layer_index,
+                topk_mem_indices=topk_mem_indices,
+            )
+
         if topk_mem_indices.ndim == 2:
             topk_mem_indices = topk_mem_indices.unsqueeze(1)
         assert topk_mem_indices.shape[1] == 1, "FlashMLA sparse decode path currently expects seq_len_q == 1"

         q_nope, q_rope = q
         q_all = torch.cat([q_nope, q_rope], dim=-1).unsqueeze(1).contiguous()
+        if kv.shape[0] == 0:
+            return torch.zeros(
+                (q_nope.shape[0], q_nope.shape[1], kv_lora_rank),
+                dtype=q_nope.dtype,
+                device=q_nope.device,
+            )

         o_tensor, _ = flash_mla.flash_mla_with_kvcache(
             q=q_all,
-            k_cache=kv.contiguous(),
+            k_cache=kv if kv.is_contiguous() else kv.contiguous(),
             block_table=None,
             cache_seqlens=None,
             head_dim_v=kv_lora_rank,
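
For context, below is a minimal, self-contained sketch (not the lightllm implementation) of the duck-typed hook this decode path probes with hasattr(). It assumes the memory manager gathers the KV rows selected by NSA for one layer into a dense tensor and returns the top-k indices remapped into that tensor. The class name ToyMemManager, the buffer layout, and the torch.unique-based gather are illustrative assumptions; only the method name and its keyword arguments come from the diff above.

import torch


class ToyMemManager:
    """Hypothetical memory manager exposing the hook probed by _nsa_decode_att."""

    def __init__(self, num_layers: int, num_tokens: int, kv_dim: int):
        # One flat KV buffer per layer, indexed by global memory-slot ids.
        self.kv_buffer = torch.randn(num_layers, num_tokens, 1, kv_dim)

    def get_decode_kv_cache_and_remap_indices(self, layer_index, topk_mem_indices):
        # topk_mem_indices: (batch, topk) global slot ids selected by NSA.
        unique_slots, remapped = torch.unique(topk_mem_indices, return_inverse=True)
        # Gather only the referenced slots for this layer into a dense tensor.
        kv = self.kv_buffer[layer_index, unique_slots]
        # `remapped` now indexes rows of the gathered `kv` tensor.
        return kv, remapped.view_as(topk_mem_indices)


mem_manager = ToyMemManager(num_layers=2, num_tokens=128, kv_dim=576)
topk_mem_indices = torch.randint(0, 128, (4, 16))  # batch=4, top-16 tokens per request

# Same duck-typed probe as in the patched decode path.
if hasattr(mem_manager, "get_decode_kv_cache_and_remap_indices"):
    kv, topk_mem_indices = mem_manager.get_decode_kv_cache_and_remap_indices(
        layer_index=0, topk_mem_indices=topk_mem_indices
    )

# The remapped indices stay within the gathered cache.
print(kv.shape, int(topk_mem_indices.max()) < kv.shape[0])

Under these assumptions, kv.shape[0] can legitimately be zero when no slots are selected, which appears to be what the new early-return guard in the diff handles before calling flash_mla.flash_mla_with_kvcache.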
