
Commit 69f8fed

xgqdut2016 and qinyiqun authored

issue/1090: Add flash attention for QY machines (#1099)

* issue/1090: qy flash-attention
* issue/1090: successfully link flash-attention.so
* issue/1090: qy flash guard
* issue/1090: qy flash working
* issue/1090: remove unnecessary code and .contiguous() function

Co-authored-by: qinyiqun <qinyiqun@outlook.com>

1 parent 9ead056 commit 69f8fed

7 files changed

Lines changed: 110 additions & 11 deletions


include/infinicore/adaptor/aten_adaptor.hpp

Lines changed: 6 additions & 3 deletions
@@ -5,9 +5,10 @@

 #include <ATen/ATen.h>

-#ifdef ENABLE_NVIDIA_API
-#include <ATen/cuda/CUDAContext.h>
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#include <c10/cuda/CUDAStream.h>
 #include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
 #endif

 namespace infinicore::adaptor {
@@ -33,14 +34,16 @@ inline at::Device to_at_device(const Device &device) {
         return at::Device(at::kCUDA, device.getIndex());
     } else if (device.getType() == Device::Type::CPU) {
         return at::Device(at::kCPU);
+    } else if (device.getType() == Device::Type::QY) {
+        return at::Device(at::kCUDA, device.getIndex());
     } else {
         throw std::runtime_error("Unsupported device type for ATen");
     }
 }

 at::Tensor to_aten_tensor(const infinicore::Tensor &t);

-#ifdef ENABLE_NVIDIA_API
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
 c10::cuda::CUDAStream get_cuda_stream();
 #endif
 } // namespace infinicore::adaptor

src/infinicore/adaptor/aten_adaptor.cc

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ at::Tensor to_aten_tensor(const infinicore::Tensor &t) {
         options);
 }

-#ifdef ENABLE_NVIDIA_API
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
 c10::cuda::CUDAStream get_cuda_stream() {
     return c10::cuda::getStreamFromExternal(
         cudaStream_t(infinicore::context::getStream()), infinicore::context::getDevice().getIndex());
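
Together, the header and the .cc change extend the previously NVIDIA-only ATen bridge to QY: a QY device is presented to ATen as at::kCUDA, and get_cuda_stream() wraps InfiniCore's current stream as a c10 CUDA stream. Below is a minimal sketch of how a call site can combine the two; the helper name run_aten_softmax and the at::softmax call are illustrative assumptions, not code from this commit.

// Sketch only: illustrates the intended use of the adaptor API above.
// run_aten_softmax and the softmax op are illustrative, not part of this commit.
#include <ATen/ATen.h>
#include <c10/cuda/CUDAGuard.h>
#include "infinicore/adaptor/aten_adaptor.hpp"

at::Tensor run_aten_softmax(const infinicore::Tensor &t) {
    // Convert the InfiniCore tensor to its ATen representation.
    at::Tensor aten_t = infinicore::adaptor::to_aten_tensor(t);

#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
    // Route ATen kernel launches onto InfiniCore's current stream
    // (on QY this works because the device is exposed to ATen as at::kCUDA).
    c10::cuda::CUDAStreamGuard guard(infinicore::adaptor::get_cuda_stream());
#endif
    return at::softmax(aten_t, /*dim=*/-1);
}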

src/infinicore/nn/embedding.cc

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ Embedding::Embedding(size_t num_embeddings,
 Tensor Embedding::forward(const Tensor &indices) const {
     // TODO: Implement on-device embedding for all devices, then remove the condition and the classic approach
     auto device_type = device_.getType();
-    if (device_type == Device::Type::NVIDIA || device_type == Device::Type::ILUVATAR || device_type == Device::Type::METAX || device_type == Device::Type::MOORE || device_type == Device::Type::ALI) {
+    if (device_type == Device::Type::NVIDIA || device_type == Device::Type::ILUVATAR || device_type == Device::Type::METAX || device_type == Device::Type::MOORE || device_type == Device::Type::ALI || device_type == Device::Type::QY) {
         // Use op::embedding which supports device-side input and batch dimension
         return op::embedding(indices->contiguous()->to(device_), weight_);
     }

src/infinicore/ops/mha_kvcache/mha_kvcache_flashattn.cc

Lines changed: 5 additions & 0 deletions
@@ -38,8 +38,13 @@ void run(void *planned_meta) {

     auto out_tensor = infinicore::adaptor::to_aten_tensor(p->out);
     auto q = infinicore::adaptor::to_aten_tensor(p->q);
+#if defined(ENABLE_NVIDIA_API)
     auto k_cache = infinicore::adaptor::to_aten_tensor(p->k_cache);
     auto v_cache = infinicore::adaptor::to_aten_tensor(p->v_cache);
+#elif defined(ENABLE_QY_API)
+    auto k_cache = infinicore::adaptor::to_aten_tensor(p->k_cache).contiguous();
+    auto v_cache = infinicore::adaptor::to_aten_tensor(p->v_cache).contiguous();
+#endif
     auto seqlens_k = std::optional<const at::Tensor>(infinicore::adaptor::to_aten_tensor(p->seqlens_k));
     auto block_table = std::optional<at::Tensor>(infinicore::adaptor::to_aten_tensor(p->block_table));
     auto alibi_slopes = p->alibi_slopes
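
The QY branch differs only in calling .contiguous() on the KV caches before they reach the flash-attention kernel. A minimal, self-contained sketch of what that guard buys, with illustrative shapes not taken from the commit: a strided view is repacked into dense storage, and the call is a no-op when the tensor is already contiguous.

// Sketch only, not part of the commit: .contiguous() materializes a densely
// packed copy when the input is a strided view, and returns the tensor
// unchanged when its layout is already dense.
#include <ATen/ATen.h>

void contiguity_example() {
    at::Tensor cache = at::zeros({8, 16, 128});                               // illustrative cache-like shape
    at::Tensor view  = cache.narrow(/*dim=*/1, /*start=*/0, /*length=*/4);    // strided view, not contiguous
    at::Tensor dense = view.contiguous();                                     // packed copy a dense-layout kernel can consume
    // view.is_contiguous() == false, dense.is_contiguous() == true
}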

src/infiniop/ops/paged_caching/operator.cc

Lines changed: 13 additions & 1 deletion
@@ -2,7 +2,7 @@
 #include "../../handle.h"
 #include "infiniop/ops/paged_caching.h"

-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
 #include "nvidia/paged_caching_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
@@ -43,6 +43,9 @@ __INFINI_C infiniStatus_t infiniopCreatePagedCachingDescriptor(
 #endif
 #ifdef ENABLE_MOORE_API
     CREATE(INFINI_DEVICE_MOORE, moore)
+#endif
+#ifdef ENABLE_QY_API
+    CREATE(INFINI_DEVICE_QY, nvidia)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -73,6 +76,9 @@ __INFINI_C infiniStatus_t infiniopGetPagedCachingWorkspaceSize(
 #endif
 #ifdef ENABLE_MOORE_API
     GET(INFINI_DEVICE_MOORE, moore)
+#endif
+#ifdef ENABLE_QY_API
+    GET(INFINI_DEVICE_QY, nvidia)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -107,6 +113,9 @@ __INFINI_C infiniStatus_t infiniopPagedCaching(
 #endif
 #ifdef ENABLE_MOORE_API
     CALCULATE(INFINI_DEVICE_MOORE, moore)
+#endif
+#ifdef ENABLE_QY_API
+    CALCULATE(INFINI_DEVICE_QY, nvidia)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -136,6 +145,9 @@ __INFINI_C infiniStatus_t infiniopDestroyPagedCachingDescriptor(
 #endif
 #ifdef ENABLE_MOORE_API
     DESTROY(INFINI_DEVICE_MOORE, moore)
+#endif
+#ifdef ENABLE_QY_API
+    DESTROY(INFINI_DEVICE_QY, nvidia)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
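
Each of the four QY entries reuses the nvidia backend, mirroring how ILUVATAR and ALI are routed by the #if at the top of the file. A small self-contained sketch of the dispatch idea follows; the enum values match the diff, but the function and its string return are purely illustrative, and the real CREATE/GET/CALCULATE/DESTROY macros expand to calls into per-backend namespaces instead.

// Sketch only: illustrates that the QY case resolves to the nvidia implementation.
#include <cstdio>

enum InfiniDevice { INFINI_DEVICE_NVIDIA, INFINI_DEVICE_MOORE, INFINI_DEVICE_QY };

const char *paged_caching_backend(InfiniDevice dev) {
    switch (dev) {
    case INFINI_DEVICE_NVIDIA:
        return "nvidia";
    case INFINI_DEVICE_MOORE:
        return "moore";
    case INFINI_DEVICE_QY:
        return "nvidia"; // QY is treated as CUDA-compatible and shares the nvidia kernels
    default:
        return "unsupported";
    }
}

int main() {
    std::printf("QY paged_caching dispatches to: %s\n", paged_caching_backend(INFINI_DEVICE_QY));
    return 0;
}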

xmake.lua

Lines changed: 16 additions & 2 deletions
@@ -247,7 +247,6 @@ if has_config("aten") then
     end
 end

-
 -- cuda graph
 option("graph")
     set_default(false)
@@ -259,7 +258,6 @@ if has_config("graph") then
     add_defines("USE_INFINIRT_GRAPH")
 end

-
 -- InfiniCCL
 option("ccl")
     set_default(false)
@@ -467,6 +465,22 @@ target("infinicore_cpp_api")
     if has_config("nv-gpu") then
         add_deps("flash-attn-nvidia")
     end
+    if has_config("qy-gpu") then
+        add_deps("flash-attn-qy")
+    end
+end
+
+if get_config("flash-attn") and get_config("flash-attn") ~= "" and has_config("qy-gpu") then
+    local flash_so_qy = _qy_flash_attn_cuda_so_path()
+    local flash_dir_qy = path.directory(flash_so_qy)
+    local flash_name_qy = path.filename(flash_so_qy)
+    before_link(function (target)
+        target:add(
+            "shflags",
+            "-Wl,--no-as-needed -L" .. flash_dir_qy .. " -l:" .. flash_name_qy .. " -Wl,-rpath," .. flash_dir_qy,
+            {force = true}
+        )
+    end)
 end

 before_build(function (target)

xmake/qy.lua

Lines changed: 68 additions & 3 deletions
@@ -3,6 +3,38 @@ if CUDNN_ROOT ~= nil then
     add_includedirs(CUDNN_ROOT .. "/include")
 end

+local FLASH_ATTN_ROOT = get_config("flash-attn")
+
+local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
+
+function _qy_flash_attn_cuda_so_path()
+    -- Highest priority: override the exact `.so` file to link.
+    local env_path = os.getenv("FLASH_ATTN_2_CUDA_SO")
+    if env_path and env_path ~= "" then
+        env_path = env_path:trim()
+        if os.isfile(env_path) then
+            return env_path
+        end
+        print(string.format("warning: qy+flash-attn: FLASH_ATTN_2_CUDA_SO is not a file: %s, fallback to container/default path", env_path))
+    end
+
+    -- Second priority: allow overriding the "expected" container path via env.
+    local container_path = os.getenv("FLASH_ATTN_QY_CUDA_SO_CONTAINER")
+    if not container_path or container_path == "" then
+        raise("Error: Flash Attention SO path not specified!\n")
+    end
+
+    if not os.isfile(container_path) then
+        print(
+            string.format(
+                "warning: qy+flash-attn: expected %s; install flash-attn in conda env, or export FLASH_ATTN_2_CUDA_SO.",
+                container_path
+            )
+        )
+    end
+    return container_path
+end
+
 add_includedirs("/usr/local/denglin/sdk/include", "../include")
 add_linkdirs("/usr/local/denglin/sdk/lib")
 add_links("curt", "cublas", "cudnn")
@@ -44,10 +76,20 @@ rule("qy.cuda")
         local sdk_path = "/usr/local/denglin/sdk"
         local arch = "dlgput64"

-        local relpath = path.relative(sourcefile, project.directory())
-        local objfile = path.join(config.buildir(), ".objs", target:name(), "rules", "qy.cuda", relpath .. ".o")
+
+        local relpath = path.relative(sourcefile, os.projectdir())
+
+        relpath = relpath:gsub("%.%.", "__")
+
+        local objfile = path.join(
+            config.buildir(),
+            ".objs",
+            target:name(),
+            "rules",
+            "qy.cuda",
+            relpath .. ".o"
+        )

-        -- 🟢 Force-register the .o file with the target
         target:add("objectfiles", objfile)
         target:set("buildadd", true)
         local argv = {
@@ -153,3 +195,26 @@ target("infiniccl-qy")
     set_languages("cxx17")

 target_end()
+
+target("flash-attn-qy")
+    set_kind("phony")
+    set_default(false)
+
+
+    if FLASH_ATTN_ROOT and FLASH_ATTN_ROOT ~= "" then
+        before_build(function (target)
+            target:add("includedirs", "/usr/local/denglin/sdk/include", {public = true})
+            local TORCH_DIR = os.iorunv("python", {"-c", "import torch, os; print(os.path.dirname(torch.__file__))"}):trim()
+            local PYTHON_INCLUDE = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_paths()['include'])"}):trim()
+            local PYTHON_LIB_DIR = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))"}):trim()
+
+            -- Validate build/runtime env in container and keep these paths available for downstream linking.
+            target:add("includedirs", TORCH_DIR .. "/include", TORCH_DIR .. "/include/torch/csrc/api/include", PYTHON_INCLUDE, {public = false})
+            target:add("linkdirs", TORCH_DIR .. "/lib", PYTHON_LIB_DIR, {public = false})
+        end)
+    else
+        before_build(function (target)
+            print("Flash Attention not available, skipping flash-attn-qy integration")
+        end)
+    end
+target_end()
