diff --git a/test/test_ops.py b/test/test_ops.py index 9521f21a815..4686fecf35c 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -643,6 +643,43 @@ def test_performance_mps(self): execution_time_ms < execution_time_ms_threshold ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms" + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_roi_align_large_index(self, device): + """Non-regression test for https://github.com/pytorch/vision/issues/8206""" + if device == "cpu": + pytest.skip("Too slow on CPU") + + pooled_h, pooled_w = 7, 7 + channels = 4 + # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX + n_rois = 11_000_000 + num_imgs = 2 + height, width = 4, 4 + spatial_scale = 1.0 + sampling_ratio = 2 + + x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device, requires_grad=True) + rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device) + + rois[:, 0] = torch.randint(0, num_imgs, (n_rois,)) + rois[:, 1] = 0 + rois[:, 2] = 0 + rois[:, 3] = width - 1 + rois[:, 4] = height - 1 + + # Call the C++ kernel directly, in case that torchvision.ops.roi_align may fall + # back to a pure-Python path that doesn't have the int32 overflow bug. + result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False) + + # Forward kernel test + assert result.shape == (n_rois, channels, pooled_h, pooled_w) + assert result.abs().sum() > 0 + + # Backward kernel test + result.sum().backward() + assert x.grad is not None + assert x.grad.abs().sum() > 0 + class TestPSRoIAlign(RoIOpTester): mps_backward_atol = 5e-2 diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp index e0185da45df..c1e159a317c 100644 --- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp @@ -26,7 +26,8 @@ void roi_align_forward_kernel_impl( // can be parallelized using omp // #pragma omp parallel for num_threads(32) for (int n = 0; n < n_rois; n++) { - int index_n = n * channels * pooled_width * pooled_height; + int64_t index_n = + static_cast(n) * channels * pooled_width * pooled_height; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; @@ -78,14 +79,14 @@ void roi_align_forward_kernel_impl( pre_calc); for (int c = 0; c < channels; c++) { - int index_n_c = index_n + c * pooled_width * pooled_height; - const T* offset_input = - input + (roi_batch_ind * channels + c) * height * width; + int64_t index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_input = input + + (static_cast(roi_batch_ind) * channels + c) * height * width; int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { - int index = index_n_c + ph * pooled_width + pw; + int64_t index = index_n_c + ph * pooled_width + pw; T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { @@ -175,7 +176,7 @@ inline void add(T* address, const T& val) { template void roi_align_backward_kernel_impl( - int nthreads, + int64_t nthreads, const T* grad_output, const T& spatial_scale, int channels, @@ -191,7 +192,7 @@ void roi_align_backward_kernel_impl( int c_stride, int h_stride, int w_stride) { - for (int index = 0; index < nthreads; index++) { + for (int64_t index = 0; index < nthreads; index++) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -219,10 +220,10 @@ void roi_align_backward_kernel_impl( T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - T* offset_grad_input = - grad_input + ((roi_batch_ind * channels + c) * height * width); + T* offset_grad_input = grad_input + + ((static_cast(roi_batch_ind) * channels + c) * height * width); - int output_offset = n * n_stride + c * c_stride; + int64_t output_offset = static_cast(n) * n_stride + c * c_stride; const T* offset_grad_output = grad_output + output_offset; const T grad_output_this_bin = offset_grad_output[ph * h_stride + pw * w_stride]; diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu index 26c53448663..b0baef66daa 100644 --- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu +++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu @@ -67,7 +67,7 @@ __device__ T bilinear_interpolate( template __global__ void roi_align_forward_kernel_impl( - int nthreads, + int64_t nthreads, const T* input, const T spatial_scale, int channels, @@ -79,7 +79,7 @@ __global__ void roi_align_forward_kernel_impl( bool aligned, const T* rois, T* output) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { + CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -107,8 +107,8 @@ __global__ void roi_align_forward_kernel_impl( T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - const T* offset_input = - input + (roi_batch_ind * channels + c) * height * width; + const T* offset_input = input + + (static_cast(roi_batch_ind) * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) @@ -203,7 +203,7 @@ __device__ void bilinear_interpolate_gradient( template __global__ void roi_align_backward_kernel_impl( - int nthreads, + int64_t nthreads, const T* grad_output, const T spatial_scale, int channels, @@ -219,8 +219,8 @@ __global__ void roi_align_backward_kernel_impl( int c_stride, int h_stride, int w_stride, - const int memory_span) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int64_t memory_span) { + CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -250,7 +250,8 @@ __global__ void roi_align_backward_kernel_impl( // We need to index the gradient using the tensor strides to access the // correct values. - const int output_offset = n * n_stride + c * c_stride; + const int64_t output_offset = + static_cast(n) * n_stride + c * c_stride; const T* offset_grad_output = grad_output + output_offset; const T grad_output_this_bin = offset_grad_output[ph * h_stride + pw * w_stride]; @@ -265,7 +266,8 @@ __global__ void roi_align_backward_kernel_impl( // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 - const int input_offset = (roi_batch_ind * channels + c) * height * width; + const int64_t input_offset = + (static_cast(roi_batch_ind) * channels + c) * height * width; for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 {