Skip to content

Commit c0b01aa

Browse files
NicolasHug authored and pytorchmergebot committed
Add deleter support to torch::stable::from_blob (pytorch#173371)
This PR adds `deleter` support to `torch::stable::from_blob` by adding a new `aoti_torch_create_tensor_from_blob_v3`. We need it to cleanly port TorchCodec to the stable ABI in meta-pytorch/torchcodec#1188 There's a bit of scaffolding, especially for the tests where I had to create a new `test/cpp_extensions/libtorch_agn_2_11_extension` folder. Most of it is just copy/paste from `test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_10/__init__.py` Pull Request resolved: pytorch#173371 Approved by: https://github.com/janeyx99
1 parent 280457c commit c0b01aa

10 files changed

Lines changed: 465 additions & 3 deletions

File tree

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
#include <torch/csrc/stable/device.h>
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/tensor.h>

#include <array>
#include <cstdint>
#include <stdexcept>

#ifdef LAE_USE_CUDA
#include <cuda_runtime.h>
#endif
10+
using torch::stable::Tensor;
11+
12+
// Counter recording how many times the test deleter has fired. It is
// exposed to the Python test harness through get_deleter_call_count()
// and cleared through reset_deleter_call_count().
static int64_t g_deleter_call_count = 0;

// Deleter handed to from_blob in tests: it ignores the data pointer and
// only bumps the counter so callers can assert that it ran.
static void test_deleter(void* /*data*/) {
  ++g_deleter_call_count;
}
18+
19+
// Wrapper for from_blob with deleter - uses a test deleter that increments
20+
// a global counter
21+
Tensor my_from_blob_with_deleter(
22+
int64_t data_ptr,
23+
torch::headeronly::HeaderOnlyArrayRef<int64_t> sizes,
24+
torch::headeronly::HeaderOnlyArrayRef<int64_t> strides,
25+
torch::stable::Device device,
26+
torch::headeronly::ScalarType dtype) {
27+
void* data = reinterpret_cast<void*>(data_ptr);
28+
return torch::stable::from_blob(
29+
data, sizes, strides, device, dtype, test_deleter);
30+
}
31+
32+
int64_t get_deleter_call_count() {
33+
return g_deleter_call_count;
34+
}
35+
36+
// Clears the deleter-invocation counter so each test starts from zero.
void reset_deleter_call_count() {
  g_deleter_call_count = 0;
}
39+
40+
// Schema registrations for the 2.11 libtorch-agnostic test operators.
STABLE_TORCH_LIBRARY(libtorch_agn_2_11, m) {
  m.def(
      "my_from_blob_with_deleter(int data_ptr, int[] sizes, int[] strides, Device device, ScalarType dtype) -> Tensor");
  m.def("get_deleter_call_count() -> int");
  m.def("reset_deleter_call_count() -> ()");
}
46+
47+
// Kernel registrations; these ops are bound under
// CompositeExplicitAutograd.
STABLE_TORCH_LIBRARY_IMPL(libtorch_agn_2_11, CompositeExplicitAutograd, m) {
  m.impl("my_from_blob_with_deleter", TORCH_BOX(&my_from_blob_with_deleter));
  m.impl("get_deleter_call_count", TORCH_BOX(&get_deleter_call_count));
  m.impl("reset_deleter_call_count", TORCH_BOX(&reset_deleter_call_count));
}
55+
56+
#ifdef LAE_USE_CUDA
57+
58+
// Wrapper for cudaFree since it returns cudaError_t, not void
59+
static void cuda_deleter(void* data) {
60+
cudaFree(data);
61+
}
62+
63+
// Creates a tensor that owns its CUDA memory via cudaMalloc.
64+
// When the tensor is destroyed, the deleter will call cudaFree.
65+
// This tests that from_blob's deleter properly frees memory.
66+
Tensor my_from_blob_with_cuda_deleter(
67+
int64_t numel,
68+
torch::stable::Device device) {
69+
size_t size_bytes = numel * sizeof(float);
70+
71+
void* data = nullptr;
72+
cudaError_t err = cudaMalloc(&data, size_bytes);
73+
if (err != cudaSuccess) {
74+
throw std::runtime_error("cudaMalloc failed");
75+
}
76+
77+
// Zero the memory
78+
cudaMemset(data, 0, size_bytes);
79+
80+
std::array<int64_t, 1> sizes = {numel};
81+
std::array<int64_t, 1> strides = {1};
82+
83+
return torch::stable::from_blob(
84+
data,
85+
torch::headeronly::HeaderOnlyArrayRef<int64_t>(sizes.data(), sizes.size()),
86+
torch::headeronly::HeaderOnlyArrayRef<int64_t>(strides.data(), strides.size()),
87+
device,
88+
torch::headeronly::ScalarType::Float,
89+
cuda_deleter);
90+
}
91+
92+
// Schema registration for the CUDA-only test operator.
STABLE_TORCH_LIBRARY(libtorch_agn_2_11_cuda, m) {
  m.def("my_from_blob_with_cuda_deleter(int numel, Device device) -> Tensor");
}
95+
96+
// Kernel registration for the CUDA-only test operator.
STABLE_TORCH_LIBRARY_IMPL(
    libtorch_agn_2_11_cuda,
    CompositeExplicitAutograd,
    m) {
  m.impl(
      "my_from_blob_with_cuda_deleter",
      TORCH_BOX(&my_from_blob_with_cuda_deleter));
}
99+
100+
#endif // LAE_USE_CUDA
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import ctypes
import sys
from pathlib import Path

import torch


# Locate the single compiled extension module (_C*.pyd on Windows,
# _C*.so elsewhere) that the build placed next to this file.
ext_suffix = ".pyd" if sys.platform == "win32" else ".so"
so_files = list(Path(__file__).parent.glob("_C*" + ext_suffix))
assert len(so_files) == 1, f"Expected one _C*.{{so,pyd}} file, found {len(so_files)}"

# use ctypes.CDLL instead of load_library to be able to test the unload logic
# below code is reduced from the load_library code
with torch._ops.dl_open_guard():
    loaded_lib = ctypes.CDLL(str(so_files[0]))

from . import ops


__all__ = [
    "loaded_lib",
    "ops",
]
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import torch
2+
from torch import Tensor
3+
4+
5+
def my_from_blob_with_deleter(data_ptr, sizes, strides, device, dtype) -> Tensor:
    """
    Creates a Tensor from existing memory with a deleter callback.

    The deleter is invoked when the tensor's storage is deallocated. For
    this test it only increments a global counter, which lets callers
    assert via get_deleter_call_count() that it actually ran.

    Args:
        data_ptr: int - pointer to the data buffer
        sizes: tuple[int] - size of the tensor
        strides: tuple[int] - strides of the tensor
        device: Device - device on which the tensor resides
        dtype: ScalarType - data type of the tensor

    Returns: Tensor - tensor wrapping the existing memory
    """
    op = torch.ops.libtorch_agn_2_11.my_from_blob_with_deleter.default
    return op(data_ptr, sizes, strides, device, dtype)
25+
26+
27+
def get_deleter_call_count() -> int:
    """
    Returns how many times the test deleter has been invoked.
    """
    return torch.ops.libtorch_agn_2_11.get_deleter_call_count.default()
32+
33+
34+
def reset_deleter_call_count() -> None:
    """
    Resets the deleter invocation counter back to zero.
    """
    torch.ops.libtorch_agn_2_11.reset_deleter_call_count.default()
39+
40+
41+
def my_from_blob_with_cuda_deleter(numel: int, device) -> Tensor:
    """
    Creates a CUDA tensor that owns its memory via cudaMalloc.

    The buffer is allocated with cudaMalloc and released with cudaFree by
    from_blob's deleter when the tensor is destroyed, which lets tests
    verify that the deleter really frees device memory.

    Args:
        numel: int - number of elements in the tensor
        device: Device - CUDA device

    Returns: Tensor - a 1D float32 tensor of zeros
    """
    op = torch.ops.libtorch_agn_2_11_cuda.my_from_blob_with_cuda_deleter.default
    return op(numel, device)
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import distutils.command.clean
2+
import shutil
3+
from pathlib import Path
4+
5+
from setuptools import find_packages, setup
6+
7+
import torch
8+
from torch.utils.cpp_extension import (
9+
BuildExtension,
10+
CppExtension,
11+
CUDAExtension,
12+
IS_WINDOWS,
13+
)
14+
15+
16+
ROOT_DIR = Path(__file__).parent
17+
CSRC_DIR = ROOT_DIR / "csrc"
18+
19+
20+
class clean(distutils.command.clean.clean):
    """`setup.py clean` command that also removes built extension output."""

    def run(self):
        # Run the stock distutils clean first.
        distutils.command.clean.clean.run(self)

        # Remove compiled extension modules from the package directory.
        for so_path in (ROOT_DIR / "libtorch_agn_2_11").glob("**/*.so"):
            so_path.unlink()

        # Remove build, dist, and egg-info output directories.
        for out_dir in (
            ROOT_DIR / "build",
            ROOT_DIR / "dist",
            ROOT_DIR / "libtorch_agn_2_11.egg-info",
        ):
            if out_dir.exists():
                shutil.rmtree(str(out_dir), ignore_errors=True)
37+
38+
39+
def get_extension():
    """Builds the single `_C` extension module for this test package.

    Uses CppExtension by default and switches to CUDAExtension (adding
    the LAE_USE_CUDA define and any .cu sources) when CUDA is available.
    """
    cxx_flags = [
        "-DTORCH_TARGET_VERSION=0x020b000000000000",
    ]
    if not IS_WINDOWS:
        cxx_flags.append("-fdiagnostics-color=always")

    extra_compile_args = {"cxx": cxx_flags}
    sources = list(CSRC_DIR.glob("**/*.cpp"))
    extension = CppExtension

    if torch.cuda.is_available():
        # allow including <cuda_runtime.h>
        cxx_flags.append("-DLAE_USE_CUDA")
        # NOTE(review): nvcc receives -DUSE_CUDA while cxx receives
        # -DLAE_USE_CUDA — confirm the asymmetry is intended for any
        # future .cu sources.
        extra_compile_args["nvcc"] = ["-O2", "-DUSE_CUDA"]
        extension = CUDAExtension
        sources.extend(CSRC_DIR.glob("**/*.cu"))

    return [
        extension(
            "libtorch_agn_2_11._C",
            sources=sorted(str(s) for s in sources),
            py_limited_api=True,
            extra_compile_args=extra_compile_args,
            extra_link_args=[],
        )
    ]
67+
68+
69+
# Package metadata and build wiring for the 2.11 libtorch-agnostic
# extension exercised by test_libtorch_agnostic.py.
setup(
    name="libtorch_agn_2_11",
    version="0.0",
    author="PyTorch Core Team",
    description="Example of libtorch agnostic extension for PyTorch 2.11+",
    packages=find_packages(exclude=("test",)),
    package_data={"libtorch_agn_2_11": ["*.dll", "*.dylib", "*.so"]},
    install_requires=["torch"],
    ext_modules=get_extension(),
    cmdclass={
        # no_python_abi_suffix keeps the module name a plain _C.so
        "build_ext": BuildExtension.with_options(no_python_abi_suffix=True),
        "clean": clean,
    },
    options={"bdist_wheel": {"py_limited_api": "cp39"}},
)

test/cpp_extensions/test_libtorch_agnostic.py

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Owner(s): ["module: cpp"]
22

3+
import gc
34
import math
45
import sysconfig
56
import unittest
@@ -80,18 +81,19 @@ class TestLibtorchAgnostic(TestCase):
8081
"""
8182
Tests for versioned libtorch_agnostic extensions.
8283
83-
This test class supports testing both:
84+
This test class supports testing:
8485
8586
- libtorch_agn_2_9: Extension built with TORCH_TARGET_VERSION=2.9.0
8687
- libtorch_agn_2_10: Extension built with TORCH_TARGET_VERSION=2.10.0
88+
- libtorch_agn_2_11: Extension built with TORCH_TARGET_VERSION=2.11.0
8789
8890
Tests should be decorated with @skipIfTorchVersionLessThan to indicate the
8991
version that they target.
9092
"""
9193

9294
@classmethod
9395
def setUpClass(cls):
94-
# Build both 2.9 and 2.10 extensions
96+
# Build versioned extensions
9597
base_dir = Path(__file__).parent
9698

9799
try:
@@ -101,7 +103,7 @@ def setUpClass(cls):
101103
extension_root=base_dir / "libtorch_agn_2_9_extension"
102104
)
103105

104-
# Only build 2.10 extension if running on PyTorch 2.10+
106+
# Only build 2.X extension if running on PyTorch 2.X+
105107
import re
106108

107109
version_parts = torch.__version__.split(".")
@@ -119,6 +121,16 @@ def setUpClass(cls):
119121
else:
120122
print(f"Skipping 2.10 extension (running on PyTorch {torch.__version__})")
121123

124+
if (current_major > 2) or (current_major == 2 and current_minor >= 11):
125+
try:
126+
import libtorch_agn_2_11 # noqa: F401
127+
except Exception:
128+
install_cpp_extension(
129+
extension_root=base_dir / "libtorch_agn_2_11_extension"
130+
)
131+
else:
132+
print(f"Skipping 2.11 extension (running on PyTorch {torch.__version__})")
133+
122134
@onlyCPU
123135
def test_slow_sgd(self, device):
124136
import libtorch_agn_2_9 as libtorch_agnostic
@@ -1660,6 +1672,60 @@ def test_my_subtract(self, device):
16601672
expected_broadcast = torch.subtract(a, c)
16611673
self.assertEqual(result_broadcast, expected_broadcast)
16621674

1675+
@skipIfTorchVersionLessThan(2, 11)
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
def test_my_from_blob_with_deleter(self, device):
    """Test for from_blob with custom deleter (2.11 feature)."""
    import libtorch_agn_2_11 as libtorch_agnostic

    libtorch_agnostic.ops.reset_deleter_call_count()
    self.assertEqual(libtorch_agnostic.ops.get_deleter_call_count(), 0)

    # Build the blob tensor on top of an existing tensor's storage.
    original = torch.rand(2, 3, device=device, dtype=torch.float32)
    blob_tensor = libtorch_agnostic.ops.my_from_blob_with_deleter(
        original.data_ptr(),
        original.size(),
        original.stride(),
        device,
        torch.float32,
    )

    # Same values and the very same underlying memory.
    self.assertEqual(blob_tensor, original)
    self.assertEqual(blob_tensor.data_ptr(), original.data_ptr())

    # The deleter must not fire while the blob tensor is alive.
    self.assertEqual(libtorch_agnostic.ops.get_deleter_call_count(), 0)

    del blob_tensor
    gc.collect()

    # Dropping the blob tensor triggers the deleter exactly once; the
    # original tensor still owns its memory and remains usable.
    self.assertEqual(libtorch_agnostic.ops.get_deleter_call_count(), 1)
    original += 1
1706+
1707+
@onlyCUDA
@skipIfTorchVersionLessThan(2, 11)
def test_my_from_blob_with_cuda_deleter_no_leak(self, device):
    """Test that from_blob's deleter properly frees cudaMalloc'd memory."""
    import libtorch_agn_2_11 as libtorch_agnostic

    # The extension allocates with raw cudaMalloc, which bypasses
    # PyTorch's caching allocator, so torch.cuda.memory_allocated()
    # cannot observe it (it only tracks tensors PyTorch allocates).
    # Use driver-level free memory (mem_get_info) to detect leaks.
    torch.cuda.synchronize(device)
    init_free, _ = torch.cuda.mem_get_info(device)
    numel = 1024 * 1024  # 4 MB per tensor

    for _ in range(10):
        tensor = libtorch_agnostic.ops.my_from_blob_with_cuda_deleter(numel, device)
        # Verify tensor was created correctly
        self.assertEqual(tensor.numel(), numel)
        self.assertEqual(tensor.device, torch.device(device))
        del tensor
        gc.collect()
        torch.cuda.synchronize(device)

    curr_free, _ = torch.cuda.mem_get_info(device)
    # If any of the ten 4 MB buffers had leaked, free device memory
    # would have dropped by at least one allocation's worth.
    self.assertGreaterEqual(curr_free, init_free - numel * 4)
1728+
16631729

16641730
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
16651731

0 commit comments

Comments
 (0)