diff --git a/test/test_ops.py b/test/test_ops.py
index 9521f21a815..4686fecf35c 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -643,6 +643,43 @@ def test_performance_mps(self):
             execution_time_ms < execution_time_ms_threshold
         ), f"Expected execution to take < {execution_time_ms_threshold} ms, actually took {execution_time_ms} ms"
 
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_roi_align_large_index(self, device):
+        """Non-regression test for https://github.com/pytorch/vision/issues/8206"""
+        if device == "cpu":
+            pytest.skip("Too slow on CPU")
+
+        pooled_h, pooled_w = 7, 7
+        channels = 4
+        # 11M * 4 * 7 * 7 = 2,156,000,000 > INT_MAX
+        n_rois = 11_000_000
+        num_imgs = 2
+        height, width = 4, 4
+        spatial_scale = 1.0
+        sampling_ratio = 2
+
+        x = torch.rand(num_imgs, channels, height, width, dtype=torch.float32, device=device, requires_grad=True)
+        rois = torch.zeros(n_rois, 5, dtype=torch.float32, device=device)
+
+        rois[:, 0] = torch.randint(0, num_imgs, (n_rois,))
+        rois[:, 1] = 0
+        rois[:, 2] = 0
+        rois[:, 3] = width - 1
+        rois[:, 4] = height - 1
+
+        # Call the C++ kernel directly, in case that torchvision.ops.roi_align may fall
+        # back to a pure-Python path that doesn't have the int32 overflow bug.
+        result = torch.ops.torchvision.roi_align(x, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio, False)
+
+        # Forward kernel test
+        assert result.shape == (n_rois, channels, pooled_h, pooled_w)
+        assert result.abs().sum() > 0
+
+        # Backward kernel test
+        result.sum().backward()
+        assert x.grad is not None
+        assert x.grad.abs().sum() > 0
+
 
 class TestPSRoIAlign(RoIOpTester):
     mps_backward_atol = 5e-2
diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
index e0185da45df..c1e159a317c 100644
--- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
@@ -26,7 +26,8 @@ void roi_align_forward_kernel_impl(
   // can be parallelized using omp
   // #pragma omp parallel for num_threads(32)
   for (int n = 0; n < n_rois; n++) {
-    int index_n = n * channels * pooled_width * pooled_height;
+    int64_t index_n =
+        static_cast<int64_t>(n) * channels * pooled_width * pooled_height;
 
     const T* offset_rois = rois + n * 5;
     int roi_batch_ind = offset_rois[0];
@@ -78,14 +79,14 @@ void roi_align_forward_kernel_impl(
         pre_calc);
 
     for (int c = 0; c < channels; c++) {
-      int index_n_c = index_n + c * pooled_width * pooled_height;
-      const T* offset_input =
-          input + (roi_batch_ind * channels + c) * height * width;
+      int64_t index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_input = input +
+          (static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width;
       int pre_calc_index = 0;
 
       for (int ph = 0; ph < pooled_height; ph++) {
         for (int pw = 0; pw < pooled_width; pw++) {
-          int index = index_n_c + ph * pooled_width + pw;
+          int64_t index = index_n_c + ph * pooled_width + pw;
 
           T output_val = 0.;
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
@@ -175,7 +176,7 @@ inline void add(T* address, const T& val) {
 
 template <typename T>
 void roi_align_backward_kernel_impl(
-    int nthreads,
+    int64_t nthreads,
     const T* grad_output,
     const T& spatial_scale,
     int channels,
@@ -191,7 +192,7 @@ void roi_align_backward_kernel_impl(
     int c_stride,
     int h_stride,
     int w_stride) {
-  for (int index = 0; index < nthreads; index++) {
+  for (int64_t index = 0; index < nthreads; index++) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -219,10 +220,10 @@ void roi_align_backward_kernel_impl(
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
-    T* offset_grad_input =
-        grad_input + ((roi_batch_ind * channels + c) * height * width);
+    T* offset_grad_input = grad_input +
+        ((static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width);
 
-    int output_offset = n * n_stride + c * c_stride;
+    int64_t output_offset = static_cast<int64_t>(n) * n_stride + c * c_stride;
     const T* offset_grad_output = grad_output + output_offset;
     const T grad_output_this_bin =
         offset_grad_output[ph * h_stride + pw * w_stride];
diff --git a/torchvision/csrc/ops/cuda/roi_align_kernel.cu b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
index 26c53448663..b0baef66daa 100644
--- a/torchvision/csrc/ops/cuda/roi_align_kernel.cu
+++ b/torchvision/csrc/ops/cuda/roi_align_kernel.cu
@@ -67,7 +67,7 @@ __device__ T bilinear_interpolate(
 
 template <typename T>
 __global__ void roi_align_forward_kernel_impl(
-    int nthreads,
+    int64_t nthreads,
     const T* input,
     const T spatial_scale,
     int channels,
@@ -79,7 +79,7 @@ __global__ void roi_align_forward_kernel_impl(
     bool aligned,
     const T* rois,
     T* output) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -107,8 +107,8 @@ __global__ void roi_align_forward_kernel_impl(
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
-    const T* offset_input =
-        input + (roi_batch_ind * channels + c) * height * width;
+    const T* offset_input = input +
+        (static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width;
 
     // We use roi_bin_grid to sample the grid and mimic integral
     int roi_bin_grid_h = (sampling_ratio > 0)
@@ -203,7 +203,7 @@ __device__ void bilinear_interpolate_gradient(
 
 template <typename T>
 __global__ void roi_align_backward_kernel_impl(
-    int nthreads,
+    int64_t nthreads,
     const T* grad_output,
     const T spatial_scale,
     int channels,
@@ -219,8 +219,8 @@ __global__ void roi_align_backward_kernel_impl(
     int c_stride,
     int h_stride,
     int w_stride,
-    const int memory_span) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int64_t memory_span) {
+  CUDA_1D_KERNEL_LOOP_T(index, nthreads, int64_t) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -250,7 +250,8 @@ __global__ void roi_align_backward_kernel_impl(
 
     // We need to index the gradient using the tensor strides to access the
     // correct values.
-    const int output_offset = n * n_stride + c * c_stride;
+    const int64_t output_offset =
+        static_cast<int64_t>(n) * n_stride + c * c_stride;
     const T* offset_grad_output = grad_output + output_offset;
     const T grad_output_this_bin =
         offset_grad_output[ph * h_stride + pw * w_stride];
@@ -265,7 +266,8 @@ __global__ void roi_align_backward_kernel_impl(
     // We do average (integral) pooling inside a bin
     const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
 
-    const int input_offset = (roi_batch_ind * channels + c) * height * width;
+    const int64_t input_offset =
+        (static_cast<int64_t>(roi_batch_ind) * channels + c) * height * width;
 
     for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
     {