From 8c71ea8ba4d6547c2a8a86f4fc4e7f073e1df54d Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Fri, 13 Mar 2026 02:38:38 -0700 Subject: [PATCH 01/10] fix the issue 8206 and add the test --- test/test_ops.py | 57 +++++++++++++++++++ torchvision/csrc/ops/cpu/roi_align_common.h | 18 +++--- torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 38 +++++++------ torchvision/csrc/ops/cuda/roi_align_kernel.cu | 34 +++++------ 4 files changed, 104 insertions(+), 43 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 9521f21a815..35022bc1de7 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -643,6 +643,63 @@ def test_performance_mps(self): execution_time_ms < execution_time_ms_threshold ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms" + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + def test_roi_align_large_index(self, device): + """Regression test for https://github.com/pytorch/vision/issues/8206 + + roi_align used int (32-bit) for output index arithmetic. When + n_rois * channels * pooled_h * pooled_w > INT_MAX (~2.1 billion), + the index overflows, causing a segfault on CPU or silently wrong + results on CUDA. The fix promotes index variables to int64_t. + + This test calls the C++ kernel directly to ensure the native code + path is exercised (the pure-Python fallback doesn't have this bug). + + We use n_rois=11,000,000 with channels=4 and pooled 7x7 so that + the total output element count (11M * 4 * 49 = 2.156B) exceeds + INT_MAX (2,147,483,647). The input feature map is kept tiny (4x4) + so memory is dominated by the ~8.6 GB output tensor. + """ + pooled_h, pooled_w = 7, 7 + channels = 4 + # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX + n_rois = 11_000_000 + num_imgs = 2 + height, width = 4, 4 + spatial_scale = 1.0 + sampling_ratio = 2 + + # Output is ~8.6 GB; skip if not enough memory + output_bytes = n_rois * channels * pooled_h * pooled_w * 4 # float32 + if output_bytes > 9 * 1024**3: + pytest.skip("Test requires ~9 GB of memory") + + try: + x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device) + rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device) + except RuntimeError: + pytest.skip("Not enough memory to allocate test tensors") + + rois[:, 0] = torch.randint(0, num_imgs, (n_rois,)) + rois[:, 1] = 0 + rois[:, 2] = 0 + rois[:, 3] = width - 1 + rois[:, 4] = height - 1 + + # Use torch.ops.torchvision.roi_align directly instead of + # torchvision.ops.roi_align, because the latter falls back to a + # pure-Python implementation when C++ extensions are not loaded. + # The pure-Python path uses PyTorch's native int64 tensor indexing + # and would never trigger the int32 overflow. We need to test the + # C++ kernel specifically. + try: + result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False) + except RuntimeError: + pytest.skip("Not enough memory for roi_align output") + + assert result.shape == (n_rois, channels, pooled_h, pooled_w) + assert result.abs().sum() > 0, "roi_align returned all zeros — likely an index overflow bug" + class TestPSRoIAlign(RoIOpTester): mps_backward_atol = 5e-2 diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h index e10c67b5b79..cb5c0deb658 100644 --- a/torchvision/csrc/ops/cpu/roi_align_common.h +++ b/torchvision/csrc/ops/cpu/roi_align_common.h @@ -8,10 +8,10 @@ namespace detail { template struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; + int64_t pos1; + int64_t pos2; + int64_t pos3; + int64_t pos4; T w1; T w2; T w3; @@ -42,7 +42,7 @@ void pre_calc_for_bilinear_interpolate( int roi_bin_grid_h, int roi_bin_grid_w, std::vector>& pre_calc) { - int pre_calc_index = 0; + int64_t pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { for (int iy = 0; iy < roi_bin_grid_h; iy++) { @@ -106,10 +106,10 @@ void pre_calc_for_bilinear_interpolate( // save weights and indices PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; + pc.pos1 = static_cast(y_low) * width + x_low; + pc.pos2 = static_cast(y_low) * width + x_high; + pc.pos3 = static_cast(y_high) * width + x_low; + pc.pos4 = static_cast(y_high) * width + x_high; pc.w1 = w1; pc.w2 = w2; pc.w3 = w3; diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp index e0185da45df..39f670d8112 100644 --- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp @@ -26,7 +26,8 @@ void roi_align_forward_kernel_impl( // can be parallelized using omp // #pragma omp parallel for num_threads(32) for (int n = 0; n < n_rois; n++) { - int index_n = n * channels * pooled_width * pooled_height; + int64_t index_n = + static_cast(n) * channels * pooled_width * pooled_height; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; @@ -78,14 +79,15 @@ void roi_align_forward_kernel_impl( pre_calc); for (int c = 0; c < channels; c++) { - int index_n_c = index_n + c * pooled_width * pooled_height; - const T* offset_input = - input + (roi_batch_ind * channels + c) * height * width; + int64_t index_n_c = + index_n + static_cast(c) * pooled_width * pooled_height; + const T* offset_input = input + + (static_cast(roi_batch_ind) * channels + c) * height * width; int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { - int index = index_n_c + ph * pooled_width + pw; + int64_t index = index_n_c + ph * pooled_width + pw; T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { @@ -175,7 +177,7 @@ inline void add(T* address, const T& val) { template void roi_align_backward_kernel_impl( - int nthreads, + int64_t nthreads, const T* grad_output, const T& spatial_scale, int channels, @@ -187,11 +189,11 @@ void roi_align_backward_kernel_impl( bool aligned, T* grad_input, const T* rois, - int n_stride, - int c_stride, - int h_stride, - int w_stride) { - for (int index = 0; index < nthreads; index++) { + int64_t n_stride, + int64_t c_stride, + int64_t h_stride, + int64_t w_stride) { + for (int64_t index = 0; index < nthreads; index++) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -219,10 +221,10 @@ void roi_align_backward_kernel_impl( T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - T* offset_grad_input = - grad_input + ((roi_batch_ind * channels + c) * height * width); + T* offset_grad_input = grad_input + + ((static_cast(roi_batch_ind) * channels + c) * height * width); - int output_offset = n * n_stride + c * c_stride; + int64_t output_offset = static_cast(n) * n_stride + c * c_stride; const T* offset_grad_output = grad_output + output_offset; const T grad_output_this_bin = offset_grad_output[ph * h_stride + pw * w_stride]; @@ -359,10 +361,10 @@ at::Tensor roi_align_backward_kernel( } // get stride values to ensure indexing into gradients is correct. - int n_stride = grad.stride(0); - int c_stride = grad.stride(1); - int h_stride = grad.stride(2); - int w_stride = grad.stride(3); + int64_t n_stride = grad.stride(0); + int64_t c_stride = grad.stride(1); + int64_t h_stride = grad.stride(2); + int64_t w_stride = grad.stride(3); auto rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu index 26c53448663..414378663af 100644 --- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu +++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu @@ -67,7 +67,7 @@ __device__ T bilinear_interpolate( template __global__ void roi_align_forward_kernel_impl( - int nthreads, + int64_t nthreads, const T* input, const T spatial_scale, int channels, @@ -79,7 +79,7 @@ __global__ void roi_align_forward_kernel_impl( bool aligned, const T* rois, T* output) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { + CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -107,8 +107,8 @@ __global__ void roi_align_forward_kernel_impl( T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - const T* offset_input = - input + (roi_batch_ind * channels + c) * height * width; + const T* offset_input = input + + (static_cast(roi_batch_ind) * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) @@ -203,7 +203,7 @@ __device__ void bilinear_interpolate_gradient( template __global__ void roi_align_backward_kernel_impl( - int nthreads, + int64_t nthreads, const T* grad_output, const T spatial_scale, int channels, @@ -215,12 +215,12 @@ __global__ void roi_align_backward_kernel_impl( bool aligned, T* grad_input, const T* rois, - int n_stride, - int c_stride, - int h_stride, - int w_stride, + int64_t n_stride, + int64_t c_stride, + int64_t h_stride, + int64_t w_stride, const int memory_span) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { + CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -250,7 +250,8 @@ __global__ void roi_align_backward_kernel_impl( // We need to index the gradient using the tensor strides to access the // correct values. - const int output_offset = n * n_stride + c * c_stride; + const int64_t output_offset = + static_cast(n) * n_stride + c * c_stride; const T* offset_grad_output = grad_output + output_offset; const T grad_output_this_bin = offset_grad_output[ph * h_stride + pw * w_stride]; @@ -265,7 +266,8 @@ __global__ void roi_align_backward_kernel_impl( // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 - const int input_offset = (roi_batch_ind * channels + c) * height * width; + const int64_t input_offset = + (static_cast(roi_batch_ind) * channels + c) * height * width; for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { @@ -432,10 +434,10 @@ at::Tensor roi_align_backward_kernel( return grad_input; } - int n_stride = grad.stride(0); - int c_stride = grad.stride(1); - int h_stride = grad.stride(2); - int w_stride = grad.stride(3); + int64_t n_stride = grad.stride(0); + int64_t c_stride = grad.stride(1); + int64_t h_stride = grad.stride(2); + int64_t w_stride = grad.stride(3); at::globalContext().alertNotDeterministic("roi_align_backward_kernel"); From 40b2276c08b078e54cd68e90f7ad42ae0441a0ed Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Fri, 13 Mar 2026 02:44:51 -0700 Subject: [PATCH 02/10] fix the issue 8206 and add the test --- torchvision/csrc/ops/cuda/roi_align_kernel.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu index 414378663af..e9fbf4060f2 100644 --- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu +++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu @@ -219,7 +219,7 @@ __global__ void roi_align_backward_kernel_impl( int64_t c_stride, int64_t h_stride, int64_t w_stride, - const int memory_span) { + const int64_t memory_span) { CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; From d9ab5cede68a0a0ea2c43f670ebd5549f205dcdd Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Fri, 13 Mar 2026 02:54:24 -0700 Subject: [PATCH 03/10] remove unnecessary comments --- test/test_ops.py | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 35022bc1de7..1bd4e73f0b3 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -645,21 +645,7 @@ def test_performance_mps(self): @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) def test_roi_align_large_index(self, device): - """Regression test for https://github.com/pytorch/vision/issues/8206 - - roi_align used int (32-bit) for output index arithmetic. When - n_rois * channels * pooled_h * pooled_w > INT_MAX (~2.1 billion), - the index overflows, causing a segfault on CPU or silently wrong - results on CUDA. The fix promotes index variables to int64_t. - - This test calls the C++ kernel directly to ensure the native code - path is exercised (the pure-Python fallback doesn't have this bug). - - We use n_rois=11,000,000 with channels=4 and pooled 7x7 so that - the total output element count (11M * 4 * 49 = 2.156B) exceeds - INT_MAX (2,147,483,647). The input feature map is kept tiny (4x4) - so memory is dominated by the ~8.6 GB output tensor. - """ + """Regression test for https://github.com/pytorch/vision/issues/8206""" pooled_h, pooled_w = 7, 7 channels = 4 # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX @@ -669,7 +655,6 @@ def test_roi_align_large_index(self, device): spatial_scale = 1.0 sampling_ratio = 2 - # Output is ~8.6 GB; skip if not enough memory output_bytes = n_rois * channels * pooled_h * pooled_w * 4 # float32 if output_bytes > 9 * 1024**3: pytest.skip("Test requires ~9 GB of memory") @@ -686,12 +671,6 @@ def test_roi_align_large_index(self, device): rois[:, 3] = width - 1 rois[:, 4] = height - 1 - # Use torch.ops.torchvision.roi_align directly instead of - # torchvision.ops.roi_align, because the latter falls back to a - # pure-Python implementation when C++ extensions are not loaded. - # The pure-Python path uses PyTorch's native int64 tensor indexing - # and would never trigger the int32 overflow. We need to test the - # C++ kernel specifically. try: result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False) except RuntimeError: From 544d960edd7e699f7d2ddebca7dbb0c833570e1b Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Tue, 17 Mar 2026 02:11:47 -0700 Subject: [PATCH 04/10] address the comments --- test/test_ops.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 1bd4e73f0b3..e11b94545cf 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -655,15 +655,8 @@ def test_roi_align_large_index(self, device): spatial_scale = 1.0 sampling_ratio = 2 - output_bytes = n_rois * channels * pooled_h * pooled_w * 4 # float32 - if output_bytes > 9 * 1024**3: - pytest.skip("Test requires ~9 GB of memory") - - try: - x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device) - rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device) - except RuntimeError: - pytest.skip("Not enough memory to allocate test tensors") + x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device) + rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device) rois[:, 0] = torch.randint(0, num_imgs, (n_rois,)) rois[:, 1] = 0 @@ -671,10 +664,9 @@ def test_roi_align_large_index(self, device): rois[:, 3] = width - 1 rois[:, 4] = height - 1 - try: - result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False) - except RuntimeError: - pytest.skip("Not enough memory for roi_align output") + # Call the C++ kernel directly, in case that torchvision.ops.roi_align may fall + # back to a pure-Python path that doesn't have the int32 overflow bug. + result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False) assert result.shape == (n_rois, channels, pooled_h, pooled_w) assert result.abs().sum() > 0, "roi_align returned all zeros — likely an index overflow bug" From 76f6a163e62977b3381c4895abf5064a83362d48 Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Tue, 17 Mar 2026 10:53:32 -0700 Subject: [PATCH 05/10] add the backward kernel test --- test/test_ops.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_ops.py b/test/test_ops.py index e11b94545cf..21c13a8564a 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -655,7 +655,7 @@ def test_roi_align_large_index(self, device): spatial_scale = 1.0 sampling_ratio = 2 - x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device) + x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device, requires_grad=True) rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device) rois[:, 0] = torch.randint(0, num_imgs, (n_rois,)) @@ -668,9 +668,15 @@ def test_roi_align_large_index(self, device): # back to a pure-Python path that doesn't have the int32 overflow bug. result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False) + # Forward kernel test assert result.shape == (n_rois, channels, pooled_h, pooled_w) assert result.abs().sum() > 0, "roi_align returned all zeros — likely an index overflow bug" + # Backward kernel test + result.sum().backward() + assert x.grad is not None, "x.grad is None — backward was not executed" + assert x.grad.abs().sum() > 0, "x.grad is all zeros — likely an index overflow bug in the backward kernel" + class TestPSRoIAlign(RoIOpTester): mps_backward_atol = 5e-2 From b6b7ab757be5a10656c190306920a48d10ef21a3 Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Fri, 20 Mar 2026 01:26:27 -0700 Subject: [PATCH 06/10] address the test failure --- test/test_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_ops.py b/test/test_ops.py index 21c13a8564a..106a993acd6 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -643,7 +643,7 @@ def test_performance_mps(self): execution_time_ms < execution_time_ms_threshold ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms" - @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_roi_align_large_index(self, device): """Regression test for https://github.com/pytorch/vision/issues/8206""" pooled_h, pooled_w = 7, 7 From 8ff88ed263c03abfddae2bfe1c1b9af8badb790f Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Tue, 24 Mar 2026 02:48:24 -0700 Subject: [PATCH 07/10] address the comments --- test/test_ops.py | 12 ++++++------ torchvision/csrc/ops/cpu/roi_align_common.h | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 106a993acd6..497308c0c87 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -643,9 +643,9 @@ def test_performance_mps(self): execution_time_ms < execution_time_ms_threshold ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms" - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_roi_align_large_index(self, device): - """Regression test for https://github.com/pytorch/vision/issues/8206""" + @needs_cuda + def test_roi_align_large_index(self, device="cuda"): + """Non-regression test for https://github.com/pytorch/vision/issues/8206""" pooled_h, pooled_w = 7, 7 channels = 4 # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX @@ -670,12 +670,12 @@ def test_roi_align_large_index(self, device): # Forward kernel test assert result.shape == (n_rois, channels, pooled_h, pooled_w) - assert result.abs().sum() > 0, "roi_align returned all zeros — likely an index overflow bug" + assert result.abs().sum() > 0 # Backward kernel test result.sum().backward() - assert x.grad is not None, "x.grad is None — backward was not executed" - assert x.grad.abs().sum() > 0, "x.grad is all zeros — likely an index overflow bug in the backward kernel" + assert x.grad is not None + assert x.grad.abs().sum() > 0 class TestPSRoIAlign(RoIOpTester): diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h index cb5c0deb658..e10c67b5b79 100644 --- a/torchvision/csrc/ops/cpu/roi_align_common.h +++ b/torchvision/csrc/ops/cpu/roi_align_common.h @@ -8,10 +8,10 @@ namespace detail { template struct PreCalc { - int64_t pos1; - int64_t pos2; - int64_t pos3; - int64_t pos4; + int pos1; + int pos2; + int pos3; + int pos4; T w1; T w2; T w3; @@ -42,7 +42,7 @@ void pre_calc_for_bilinear_interpolate( int roi_bin_grid_h, int roi_bin_grid_w, std::vector>& pre_calc) { - int64_t pre_calc_index = 0; + int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { for (int iy = 0; iy < roi_bin_grid_h; iy++) { @@ -106,10 +106,10 @@ void pre_calc_for_bilinear_interpolate( // save weights and indices PreCalc pc; - pc.pos1 = static_cast(y_low) * width + x_low; - pc.pos2 = static_cast(y_low) * width + x_high; - pc.pos3 = static_cast(y_high) * width + x_low; - pc.pos4 = static_cast(y_high) * width + x_high; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; pc.w1 = w1; pc.w2 = w2; pc.w3 = w3; From bad6abaaeebe7bd7a03e71ffadcb684339b8944d Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Tue, 24 Mar 2026 02:59:48 -0700 Subject: [PATCH 08/10] skip the cpu test --- test/test_ops.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 497308c0c87..4686fecf35c 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -643,9 +643,12 @@ def test_performance_mps(self): execution_time_ms < execution_time_ms_threshold ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms" - @needs_cuda - def test_roi_align_large_index(self, device="cuda"): + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_roi_align_large_index(self, device): """Non-regression test for https://github.com/pytorch/vision/issues/8206""" + if device == "cpu": + pytest.skip("Too slow on CPU") + pooled_h, pooled_w = 7, 7 channels = 4 # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX From 1d9d1d4615e93593e0bb1cd428ce81ebdcdde381 Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Tue, 24 Mar 2026 03:48:34 -0700 Subject: [PATCH 09/10] check the type and cast --- torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 11 +++++------ torchvision/csrc/ops/cuda/roi_align_kernel.cu | 8 ++++---- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp index 39f670d8112..6775e61fe21 100644 --- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp @@ -79,8 +79,7 @@ void roi_align_forward_kernel_impl( pre_calc); for (int c = 0; c < channels; c++) { - int64_t index_n_c = - index_n + static_cast(c) * pooled_width * pooled_height; + int64_t index_n_c = index_n + c * pooled_width * pooled_height; const T* offset_input = input + (static_cast(roi_batch_ind) * channels + c) * height * width; int pre_calc_index = 0; @@ -189,10 +188,10 @@ void roi_align_backward_kernel_impl( bool aligned, T* grad_input, const T* rois, - int64_t n_stride, - int64_t c_stride, - int64_t h_stride, - int64_t w_stride) { + int n_stride, + int c_stride, + int h_stride, + int w_stride) { for (int64_t index = 0; index < nthreads; index++) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu index e9fbf4060f2..dd9d04951bf 100644 --- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu +++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu @@ -215,10 +215,10 @@ __global__ void roi_align_backward_kernel_impl( bool aligned, T* grad_input, const T* rois, - int64_t n_stride, - int64_t c_stride, - int64_t h_stride, - int64_t w_stride, + int n_stride, + int c_stride, + int h_stride, + int w_stride, const int64_t memory_span) { CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) { // (n, c, ph, pw) is an element in the pooled output From f40a46d9947818ce57709c5d2c881542e5651a55 Mon Sep 17 00:00:00 2001 From: Zhitao Yu Date: Tue, 24 Mar 2026 03:52:40 -0700 Subject: [PATCH 10/10] remove the unnecessary changes and stick to the original codes --- torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 8 ++++---- torchvision/csrc/ops/cuda/roi_align_kernel.cu | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp index 6775e61fe21..c1e159a317c 100644 --- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp @@ -360,10 +360,10 @@ at::Tensor roi_align_backward_kernel( } // get stride values to ensure indexing into gradients is correct. - int64_t n_stride = grad.stride(0); - int64_t c_stride = grad.stride(1); - int64_t h_stride = grad.stride(2); - int64_t w_stride = grad.stride(3); + int n_stride = grad.stride(0); + int c_stride = grad.stride(1); + int h_stride = grad.stride(2); + int w_stride = grad.stride(3); auto rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu index dd9d04951bf..b0baef66daa 100644 --- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu +++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu @@ -434,10 +434,10 @@ at::Tensor roi_align_backward_kernel( return grad_input; } - int64_t n_stride = grad.stride(0); - int64_t c_stride = grad.stride(1); - int64_t h_stride = grad.stride(2); - int64_t w_stride = grad.stride(3); + int n_stride = grad.stride(0); + int c_stride = grad.stride(1); + int h_stride = grad.stride(2); + int w_stride = grad.stride(3); at::globalContext().alertNotDeterministic("roi_align_backward_kernel");