From 8c71ea8ba4d6547c2a8a86f4fc4e7f073e1df54d Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Fri, 13 Mar 2026 02:38:38 -0700
Subject: [PATCH 01/10] Fix int32 index overflow in roi_align (issue #8206) and add a regression test

---
 test/test_ops.py                              | 57 +++++++++++++++++++
 torchvision/csrc/ops/cpu/roi_align_common.h   | 18 +++---
 torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 38 +++++++------
 torchvision/csrc/ops/cuda/roi_align_kernel.cu | 34 +++++------
 4 files changed, 104 insertions(+), 43 deletions(-)
diff --git a/test/test_ops.py b/test/test_ops.py
index 9521f21a815..35022bc1de7 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -643,6 +643,63 @@ def test_performance_mps(self):
             execution_time_ms < execution_time_ms_threshold
         ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms"
 
+    @pytest.mark.parametrize("device", cpu_and_cuda_and_mps())
+    def test_roi_align_large_index(self, device):
+        """Regression test for https://github.com/pytorch/vision/issues/8206
+
+        roi_align used int (32-bit) for output index arithmetic. When
+        n_rois * channels * pooled_h * pooled_w > INT_MAX (~2.1 billion),
+        the index overflows, causing a segfault on CPU or silently wrong
+        results on CUDA. The fix promotes index variables to int64_t.
+
+        This test calls the C++ kernel directly to ensure the native code
+        path is exercised (the pure-Python fallback doesn't have this bug).
+
+        We use n_rois=11,000,000 with channels=4 and pooled 7x7 so that
+        the total output element count (11M * 4 * 49 = 2.156B) exceeds
+        INT_MAX (2,147,483,647). The input feature map is kept tiny (4x4)
+        so memory is dominated by the ~8.6 GB output tensor.
+        """
+        pooled_h, pooled_w = 7, 7
+        channels = 4
+        # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX
+        n_rois = 11_000_000
+        num_imgs = 2
+        height, width = 4, 4
+        spatial_scale = 1.0
+        sampling_ratio = 2
+
+        # Output is ~8.6 GB; skip if not enough memory
+        output_bytes = n_rois * channels * pooled_h * pooled_w * 4  # float32
+        if output_bytes > 9 * 1024**3:
+            pytest.skip("Test requires ~9 GB of memory")
+
+        try:
+            x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device)
+            rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device)
+        except RuntimeError:
+            pytest.skip("Not enough memory to allocate test tensors")
+
+        rois[:, 0] = torch.randint(0, num_imgs, (n_rois,))
+        rois[:, 1] = 0
+        rois[:, 2] = 0
+        rois[:, 3] = width - 1
+        rois[:, 4] = height - 1
+
+        # Use torch.ops.torchvision.roi_align directly instead of
+        # torchvision.ops.roi_align, because the latter falls back to a
+        # pure-Python implementation when C++ extensions are not loaded.
+        # The pure-Python path uses PyTorch's native int64 tensor indexing
+        # and would never trigger the int32 overflow. We need to test the
+        # C++ kernel specifically.
+        try:
+            result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False)
+        except RuntimeError:
+            pytest.skip("Not enough memory for roi_align output")
+
+        assert result.shape == (n_rois, channels, pooled_h, pooled_w)
+        assert result.abs().sum() > 0, "roi_align returned all zeros — likely an index overflow bug"
+
 
 class TestPSRoIAlign(RoIOpTester):
     mps_backward_atol = 5e-2
diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h
index e10c67b5b79..cb5c0deb658 100644
--- a/torchvision/csrc/ops/cpu/roi_align_common.h
+++ b/torchvision/csrc/ops/cpu/roi_align_common.h
@@ -8,10 +8,10 @@ namespace detail {
 
 template <typename T>
 struct PreCalc {
-  int pos1;
-  int pos2;
-  int pos3;
-  int pos4;
+  int64_t pos1;
+  int64_t pos2;
+  int64_t pos3;
+  int64_t pos4;
   T w1;
   T w2;
   T w3;
@@ -42,7 +42,7 @@ void pre_calc_for_bilinear_interpolate(
     int roi_bin_grid_h,
     int roi_bin_grid_w,
     std::vector<PreCalc<T>>& pre_calc) {
-  int pre_calc_index = 0;
+  int64_t pre_calc_index = 0;
   for (int ph = 0; ph < pooled_height; ph++) {
     for (int pw = 0; pw < pooled_width; pw++) {
       for (int iy = 0; iy < roi_bin_grid_h; iy++) {
@@ -106,10 +106,10 @@ void pre_calc_for_bilinear_interpolate(
 
           // save weights and indices
           PreCalc<T> pc;
-          pc.pos1 = y_low * width + x_low;
-          pc.pos2 = y_low * width + x_high;
-          pc.pos3 = y_high * width + x_low;
-          pc.pos4 = y_high * width + x_high;
+          pc.pos1 = static_cast<int64_t>(y_low) * width + x_low;
+          pc.pos2 = static_cast<int64_t>(y_low) * width + x_high;
+          pc.pos3 = static_cast<int64_t>(y_high) * width + x_low;
+          pc.pos4 = static_cast<int64_t>(y_high) * width + x_high;
           pc.w1 = w1;
           pc.w2 = w2;
           pc.w3 = w3;
diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
index e0185da45df..39f670d8112 100644
--- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
@@ -26,7 +26,8 @@ void roi_align_forward_kernel_impl(
   // can be parallelized using omp
   // #pragma omp parallel for num_threads(32)
   for (int n = 0; n < n_rois; n++) {
-    int index_n = n * channels * pooled_width * pooled_height;
+    int64_t index_n =
+        static_cast<int64_t>(n) * channels * pooled_width * pooled_height;
 
     const T* offset_rois = rois + n * 5;
     int roi_batch_ind = offset_rois[0];
@@ -78,14 +79,15 @@ void roi_align_forward_kernel_impl(
         pre_calc);
 
     for (int c = 0; c < channels; c++) {
-      int index_n_c = index_n + c * pooled_width * pooled_height;
-      const T* offset_input =
-          input + (roi_batch_ind * channels + c) * height * width;
+      int64_t index_n_c =
+          index_n + static_cast<int64_t>(c) * pooled_width * pooled_height;
+      const T* offset_input = input +
+          (static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width;
       int pre_calc_index = 0;
 
       for (int ph = 0; ph < pooled_height; ph++) {
         for (int pw = 0; pw < pooled_width; pw++) {
-          int index = index_n_c + ph * pooled_width + pw;
+          int64_t index = index_n_c + ph * pooled_width + pw;
 
           T output_val = 0.;
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
@@ -175,7 +177,7 @@ inline void add(T* address, const T& val) {
 
 template <typename T>
 void roi_align_backward_kernel_impl(
-    int nthreads,
+    int64_t nthreads,
     const T* grad_output,
     const T& spatial_scale,
     int channels,
@@ -187,11 +189,11 @@ void roi_align_backward_kernel_impl(
     bool aligned,
     T* grad_input,
     const T* rois,
-    int n_stride,
-    int c_stride,
-    int h_stride,
-    int w_stride) {
-  for (int index = 0; index < nthreads; index++) {
+    int64_t n_stride,
+    int64_t c_stride,
+    int64_t h_stride,
+    int64_t w_stride) {
+  for (int64_t index = 0; index < nthreads; index++) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -219,10 +221,10 @@ void roi_align_backward_kernel_impl(
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
-    T* offset_grad_input =
-        grad_input + ((roi_batch_ind * channels + c) * height * width);
+    T* offset_grad_input = grad_input +
+        ((static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width);
 
-    int output_offset = n * n_stride + c * c_stride;
+    int64_t output_offset = static_cast<int64_t>(n) * n_stride + c * c_stride;
     const T* offset_grad_output = grad_output + output_offset;
     const T grad_output_this_bin =
         offset_grad_output[ph * h_stride + pw * w_stride];
@@ -359,10 +361,10 @@ at::Tensor roi_align_backward_kernel(
   }
 
   // get stride values to ensure indexing into gradients is correct.
-  int n_stride = grad.stride(0);
-  int c_stride = grad.stride(1);
-  int h_stride = grad.stride(2);
-  int w_stride = grad.stride(3);
+  int64_t n_stride = grad.stride(0);
+  int64_t c_stride = grad.stride(1);
+  int64_t h_stride = grad.stride(2);
+  int64_t w_stride = grad.stride(3);
 
   auto rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
index 26c53448663..414378663af 100644
--- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu
+++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
@@ -67,7 +67,7 @@ __device__ T bilinear_interpolate(
 
 template <typename T>
 __global__ void roi_align_forward_kernel_impl(
-    int nthreads,
+    int64_t nthreads,
     const T* input,
     const T spatial_scale,
     int channels,
@@ -79,7 +79,7 @@ __global__ void roi_align_forward_kernel_impl(
     bool aligned,
     const T* rois,
     T* output) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -107,8 +107,8 @@ __global__ void roi_align_forward_kernel_impl(
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
-    const T* offset_input =
-        input + (roi_batch_ind * channels + c) * height * width;
+    const T* offset_input = input +
+        (static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width;
 
     // We use roi_bin_grid to sample the grid and mimic integral
     int roi_bin_grid_h = (sampling_ratio > 0)
@@ -203,7 +203,7 @@ __device__ void bilinear_interpolate_gradient(
 
 template <typename T>
 __global__ void roi_align_backward_kernel_impl(
-    int nthreads,
+    int64_t nthreads,
     const T* grad_output,
     const T spatial_scale,
     int channels,
@@ -215,12 +215,12 @@ __global__ void roi_align_backward_kernel_impl(
     bool aligned,
     T* grad_input,
     const T* rois,
-    int n_stride,
-    int c_stride,
-    int h_stride,
-    int w_stride,
+    int64_t n_stride,
+    int64_t c_stride,
+    int64_t h_stride,
+    int64_t w_stride,
     const int memory_span) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -250,7 +250,8 @@ __global__ void roi_align_backward_kernel_impl(
 
     // We need to index the gradient using the tensor strides to access the
     // correct values.
-    const int output_offset = n * n_stride + c * c_stride;
+    const int64_t output_offset =
+        static_cast<int64_t>(n) * n_stride + c * c_stride;
     const T* offset_grad_output = grad_output + output_offset;
     const T grad_output_this_bin =
         offset_grad_output[ph * h_stride + pw * w_stride];
@@ -265,7 +266,8 @@ __global__ void roi_align_backward_kernel_impl(
     // We do average (integral) pooling inside a bin
     const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
 
-    const int input_offset = (roi_batch_ind * channels + c) * height * width;
+    const int64_t input_offset =
+        (static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width;
 
     for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
     {
@@ -432,10 +434,10 @@ at::Tensor roi_align_backward_kernel(
     return grad_input;
   }
 
-  int n_stride = grad.stride(0);
-  int c_stride = grad.stride(1);
-  int h_stride = grad.stride(2);
-  int w_stride = grad.stride(3);
+  int64_t n_stride = grad.stride(0);
+  int64_t c_stride = grad.stride(1);
+  int64_t h_stride = grad.stride(2);
+  int64_t w_stride = grad.stride(3);
 
   at::globalContext().alertNotDeterministic("roi_align_backward_kernel");
 

From 40b2276c08b078e54cd68e90f7ad42ae0441a0ed Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Fri, 13 Mar 2026 02:44:51 -0700
Subject: [PATCH 02/10] Widen memory_span to int64_t in the CUDA roi_align backward kernel

---
 torchvision/csrc/ops/cuda/roi_align_kernel.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
index 414378663af..e9fbf4060f2 100644
--- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu
+++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
@@ -219,7 +219,7 @@ __global__ void roi_align_backward_kernel_impl(
     int64_t c_stride,
     int64_t h_stride,
     int64_t w_stride,
-    const int memory_span) {
+    const int64_t memory_span) {
   CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;

From d9ab5cede68a0a0ea2c43f670ebd5549f205dcdd Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Fri, 13 Mar 2026 02:54:24 -0700
Subject: [PATCH 03/10] remove unnecessary comments

---
 test/test_ops.py | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 35022bc1de7..1bd4e73f0b3 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -645,21 +645,7 @@ def test_performance_mps(self):
 
     @pytest.mark.parametrize("device", cpu_and_cuda_and_mps())
     def test_roi_align_large_index(self, device):
-        """Regression test for https://github.com/pytorch/vision/issues/8206
-
-        roi_align used int (32-bit) for output index arithmetic. When
-        n_rois * channels * pooled_h * pooled_w > INT_MAX (~2.1 billion),
-        the index overflows, causing a segfault on CPU or silently wrong
-        results on CUDA. The fix promotes index variables to int64_t.
-
-        This test calls the C++ kernel directly to ensure the native code
-        path is exercised (the pure-Python fallback doesn't have this bug).
-
-        We use n_rois=11,000,000 with channels=4 and pooled 7x7 so that
-        the total output element count (11M * 4 * 49 = 2.156B) exceeds
-        INT_MAX (2,147,483,647). The input feature map is kept tiny (4x4)
-        so memory is dominated by the ~8.6 GB output tensor.
-        """
+        """Regression test for https://github.com/pytorch/vision/issues/8206"""
         pooled_h, pooled_w = 7, 7
         channels = 4
         # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX
@@ -669,7 +655,6 @@ def test_roi_align_large_index(self, device):
         spatial_scale = 1.0
         sampling_ratio = 2
 
-        # Output is ~8.6 GB; skip if not enough memory
         output_bytes = n_rois * channels * pooled_h * pooled_w * 4  # float32
         if output_bytes > 9 * 1024**3:
             pytest.skip("Test requires ~9 GB of memory")
@@ -686,12 +671,6 @@ def test_roi_align_large_index(self, device):
         rois[:, 3] = width - 1
         rois[:, 4] = height - 1
 
-        # Use torch.ops.torchvision.roi_align directly instead of
-        # torchvision.ops.roi_align, because the latter falls back to a
-        # pure-Python implementation when C++ extensions are not loaded.
-        # The pure-Python path uses PyTorch's native int64 tensor indexing
-        # and would never trigger the int32 overflow. We need to test the
-        # C++ kernel specifically.
         try:
             result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False)
         except RuntimeError:

From 544d960edd7e699f7d2ddebca7dbb0c833570e1b Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Tue, 17 Mar 2026 02:11:47 -0700
Subject: [PATCH 04/10] Address review comments: drop the memory-skip guards from the roi_align test

---
 test/test_ops.py | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 1bd4e73f0b3..e11b94545cf 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -655,15 +655,8 @@ def test_roi_align_large_index(self, device):
         spatial_scale = 1.0
         sampling_ratio = 2
 
-        output_bytes = n_rois * channels * pooled_h * pooled_w * 4  # float32
-        if output_bytes > 9 * 1024**3:
-            pytest.skip("Test requires ~9 GB of memory")
-
-        try:
-            x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device)
-            rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device)
-        except RuntimeError:
-            pytest.skip("Not enough memory to allocate test tensors")
+        x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device)
+        rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device)
 
         rois[:, 0] = torch.randint(0, num_imgs, (n_rois,))
         rois[:, 1] = 0
@@ -671,10 +664,9 @@ def test_roi_align_large_index(self, device):
         rois[:, 3] = width - 1
         rois[:, 4] = height - 1
 
-        try:
-            result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False)
-        except RuntimeError:
-            pytest.skip("Not enough memory for roi_align output")
+        # Call the C++ kernel directly, because torchvision.ops.roi_align may fall
+        # back to a pure-Python path that doesn't have the int32 overflow bug.
+        result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False)
 
         assert result.shape == (n_rois, channels, pooled_h, pooled_w)
         assert result.abs().sum() > 0, "roi_align returned all zeros — likely an index overflow bug"

From 76f6a163e62977b3381c4895abf5064a83362d48 Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Tue, 17 Mar 2026 10:53:32 -0700
Subject: [PATCH 05/10] add the backward kernel test

---
 test/test_ops.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index e11b94545cf..21c13a8564a 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -655,7 +655,7 @@ def test_roi_align_large_index(self, device):
         spatial_scale = 1.0
         sampling_ratio = 2
 
-        x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device)
+        x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device, requires_grad=True)
         rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device)
 
         rois[:, 0] = torch.randint(0, num_imgs, (n_rois,))
@@ -668,9 +668,15 @@ def test_roi_align_large_index(self, device):
         # back to a pure-Python path that doesn't have the int32 overflow bug.
         result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False)
 
+        # Forward kernel test
         assert result.shape == (n_rois, channels, pooled_h, pooled_w)
         assert result.abs().sum() > 0, "roi_align returned all zeros — likely an index overflow bug"
 
+        # Backward kernel test
+        result.sum().backward()
+        assert x.grad is not None, "x.grad is None — backward was not executed"
+        assert x.grad.abs().sum() > 0, "x.grad is all zeros — likely an index overflow bug in the backward kernel"
+
 
 class TestPSRoIAlign(RoIOpTester):
     mps_backward_atol = 5e-2

From b6b7ab757be5a10656c190306920a48d10ef21a3 Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Fri, 20 Mar 2026 01:26:27 -0700
Subject: [PATCH 06/10] Address the test failure: run the large-index test on CPU and CUDA only

---
 test/test_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 21c13a8564a..106a993acd6 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -643,7 +643,7 @@ def test_performance_mps(self):
             execution_time_ms < execution_time_ms_threshold
         ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms"
 
-    @pytest.mark.parametrize("device", cpu_and_cuda_and_mps())
+    @pytest.mark.parametrize("device", cpu_and_cuda())
     def test_roi_align_large_index(self, device):
         """Regression test for https://github.com/pytorch/vision/issues/8206"""
         pooled_h, pooled_w = 7, 7

From 8ff88ed263c03abfddae2bfe1c1b9af8badb790f Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Tue, 24 Mar 2026 02:48:24 -0700
Subject: [PATCH 07/10] Address review comments: CUDA-only test, revert PreCalc indices to int

---
 test/test_ops.py                            | 12 ++++++------
 torchvision/csrc/ops/cpu/roi_align_common.h | 18 +++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 106a993acd6..497308c0c87 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -643,9 +643,9 @@ def test_performance_mps(self):
             execution_time_ms < execution_time_ms_threshold
         ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms"
 
-    @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_roi_align_large_index(self, device):
-        """Regression test for https://github.com/pytorch/vision/issues/8206"""
+    @needs_cuda
+    def test_roi_align_large_index(self, device="cuda"):
+        """Non-regression test for https://github.com/pytorch/vision/issues/8206"""
         pooled_h, pooled_w = 7, 7
         channels = 4
         # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX
@@ -670,12 +670,12 @@ def test_roi_align_large_index(self, device):
 
         # Forward kernel test
         assert result.shape == (n_rois, channels, pooled_h, pooled_w)
-        assert result.abs().sum() > 0, "roi_align returned all zeros — likely an index overflow bug"
+        assert result.abs().sum() > 0
 
         # Backward kernel test
         result.sum().backward()
-        assert x.grad is not None, "x.grad is None — backward was not executed"
-        assert x.grad.abs().sum() > 0, "x.grad is all zeros — likely an index overflow bug in the backward kernel"
+        assert x.grad is not None
+        assert x.grad.abs().sum() > 0
 
 
 class TestPSRoIAlign(RoIOpTester):
diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h
index cb5c0deb658..e10c67b5b79 100644
--- a/torchvision/csrc/ops/cpu/roi_align_common.h
+++ b/torchvision/csrc/ops/cpu/roi_align_common.h
@@ -8,10 +8,10 @@ namespace detail {
 
 template <typename T>
 struct PreCalc {
-  int64_t pos1;
-  int64_t pos2;
-  int64_t pos3;
-  int64_t pos4;
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
   T w1;
   T w2;
   T w3;
@@ -42,7 +42,7 @@ void pre_calc_for_bilinear_interpolate(
     int roi_bin_grid_h,
     int roi_bin_grid_w,
     std::vector<PreCalc<T>>& pre_calc) {
-  int64_t pre_calc_index = 0;
+  int pre_calc_index = 0;
   for (int ph = 0; ph < pooled_height; ph++) {
     for (int pw = 0; pw < pooled_width; pw++) {
       for (int iy = 0; iy < roi_bin_grid_h; iy++) {
@@ -106,10 +106,10 @@ void pre_calc_for_bilinear_interpolate(
 
           // save weights and indices
           PreCalc<T> pc;
-          pc.pos1 = static_cast<int64_t>(y_low) * width + x_low;
-          pc.pos2 = static_cast<int64_t>(y_low) * width + x_high;
-          pc.pos3 = static_cast<int64_t>(y_high) * width + x_low;
-          pc.pos4 = static_cast<int64_t>(y_high) * width + x_high;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
           pc.w1 = w1;
           pc.w2 = w2;
           pc.w3 = w3;

From bad6abaaeebe7bd7a03e71ffadcb684339b8944d Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Tue, 24 Mar 2026 02:59:48 -0700
Subject: [PATCH 08/10] skip the cpu test

---
 test/test_ops.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 497308c0c87..4686fecf35c 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -643,9 +643,12 @@ def test_performance_mps(self):
             execution_time_ms < execution_time_ms_threshold
         ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms"
 
-    @needs_cuda
-    def test_roi_align_large_index(self, device="cuda"):
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_roi_align_large_index(self, device):
         """Non-regression test for https://github.com/pytorch/vision/issues/8206"""
+        if device == "cpu":
+            pytest.skip("Too slow on CPU")
+
         pooled_h, pooled_w = 7, 7
         channels = 4
         # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX

From 1d9d1d4615e93593e0bb1cd428ce81ebdcdde381 Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Tue, 24 Mar 2026 03:48:34 -0700
Subject: [PATCH 09/10] Revert kernel stride parameters to int and drop the cast on the channel offset

---
 torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 11 +++++------
 torchvision/csrc/ops/cuda/roi_align_kernel.cu |  8 ++++----
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
index 39f670d8112..6775e61fe21 100644
--- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
@@ -79,8 +79,7 @@ void roi_align_forward_kernel_impl(
         pre_calc);
 
     for (int c = 0; c < channels; c++) {
-      int64_t index_n_c =
-          index_n + static_cast<int64_t>(c) * pooled_width * pooled_height;
+      int64_t index_n_c = index_n + c * pooled_width * pooled_height;
       const T* offset_input = input +
           (static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width;
       int pre_calc_index = 0;
@@ -189,10 +188,10 @@ void roi_align_backward_kernel_impl(
     bool aligned,
     T* grad_input,
     const T* rois,
-    int64_t n_stride,
-    int64_t c_stride,
-    int64_t h_stride,
-    int64_t w_stride) {
+    int n_stride,
+    int c_stride,
+    int h_stride,
+    int w_stride) {
   for (int64_t index = 0; index < nthreads; index++) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
index e9fbf4060f2..dd9d04951bf 100644
--- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu
+++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
@@ -215,10 +215,10 @@ __global__ void roi_align_backward_kernel_impl(
     bool aligned,
     T* grad_input,
     const T* rois,
-    int64_t n_stride,
-    int64_t c_stride,
-    int64_t h_stride,
-    int64_t w_stride,
+    int n_stride,
+    int c_stride,
+    int h_stride,
+    int w_stride,
     const int64_t memory_span) {
   CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) {
     // (n, c, ph, pw) is an element in the pooled output

From f40a46d9947818ce57709c5d2c881542e5651a55 Mon Sep 17 00:00:00 2001
From: Zhitao Yu <zhitao@fb.com>
Date: Tue, 24 Mar 2026 03:52:40 -0700
Subject: [PATCH 10/10] Remove the unnecessary changes and stick to the
 original code

---
 torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 8 ++++----
 torchvision/csrc/ops/cuda/roi_align_kernel.cu | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
index 6775e61fe21..c1e159a317c 100644
--- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
@@ -360,10 +360,10 @@ at::Tensor roi_align_backward_kernel(
   }
 
   // get stride values to ensure indexing into gradients is correct.
-  int64_t n_stride = grad.stride(0);
-  int64_t c_stride = grad.stride(1);
-  int64_t h_stride = grad.stride(2);
-  int64_t w_stride = grad.stride(3);
+  int n_stride = grad.stride(0);
+  int c_stride = grad.stride(1);
+  int h_stride = grad.stride(2);
+  int w_stride = grad.stride(3);
 
   auto rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
index dd9d04951bf..b0baef66daa 100644
--- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu
+++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
@@ -434,10 +434,10 @@ at::Tensor roi_align_backward_kernel(
     return grad_input;
   }
 
-  int64_t n_stride = grad.stride(0);
-  int64_t c_stride = grad.stride(1);
-  int64_t h_stride = grad.stride(2);
-  int64_t w_stride = grad.stride(3);
+  int n_stride = grad.stride(0);
+  int c_stride = grad.stride(1);
+  int h_stride = grad.stride(2);
+  int w_stride = grad.stride(3);
 
   at::globalContext().alertNotDeterministic("roi_align_backward_kernel");