# challenge.py
import ctypes
from typing import Any, Dict, List

import torch

from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    def __init__(self):
        super().__init__(
            name="Max 2D Subarray Sum",
            atol=1e-05,
            rtol=1e-05,
            num_gpus=1,
            access_tier="free"
        )

    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
        # Validate input shapes and dtypes
        assert input.shape == (N, N)
        assert output.shape == (1,)
        assert input.dtype == torch.int32
        assert output.dtype == torch.int32

        # Build a summed-area table: psum[i, j] is the sum of the
        # (i+1) x (j+1) block whose top-left corner is (0, 0).
        psum = input.cumsum(dim=0).cumsum(dim=1)

        # Zero-pad the top row and left column so every window sum reduces to
        # a single inclusion-exclusion expression with no boundary cases.
        # Allocate on the input's device to avoid cross-device copies.
        padded = torch.zeros((N + 1, N + 1), dtype=torch.int32, device=input.device)
        padded[1:, 1:] = psum

        # Sum of each window_size x window_size window via inclusion-exclusion:
        # bottom_right - top_right - bottom_left + top_left.
        top_left = padded[:-window_size, :-window_size]
        top_right = padded[:-window_size, window_size:]
        bottom_left = padded[window_size:, :-window_size]
        bottom_right = padded[window_size:, window_size:]
        window_sums = bottom_right - top_right - bottom_left + top_left

        max_sum = torch.max(window_sums)
        output[0] = max_sum
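
    # The window-sum identity used above: with S the zero-padded prefix-sum
    # table and k = window_size, the sum of the k x k window whose top-left
    # input element is (r, c) is
    #   S[r + k][c + k] - S[r][c + k] - S[r + k][c] + S[r][c].
    # As a worked check against the 3x3 example below with window_size=2,
    # the four window sums are 12, 11, 15, and 14, so output[0] becomes 15.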

    def get_solve_signature(self) -> Dict[str, Any]:
        return {
            "input": ctypes.POINTER(ctypes.c_int),
            "output": ctypes.POINTER(ctypes.c_int),
            "N": ctypes.c_int,
            "window_size": ctypes.c_int
        }
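
    # The ctypes mapping above suggests a C entry point shaped roughly like
    # the hypothetical prototype below (a sketch inferred from the signature;
    # the harness's actual declaration may differ):
    #   void solve(const int* input, int* output, int N, int window_size);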

    def generate_example_test(self) -> Dict[str, Any]:
        dtype = torch.int32
        input = torch.tensor([[1, 2, 3], [4, 5, 1], [5, 1, 7]], device="cuda", dtype=dtype)
        output = torch.empty(1, device="cuda", dtype=dtype)
        return {
            "input": input,
            "output": output,
            "N": 3,
            "window_size": 2
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        dtype = torch.int32
        tests = []
        # basic example, all-negative values
        tests.append({
            "input": torch.tensor([[-1, -2, -3], [-4, -5, -1], [-5, -1, -7]], device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 3,
            "window_size": 2
        })
        # all same value; window covering the full matrix, nearly full, and 1x1
        tests.append({
            "input": torch.tensor([[2] * 16] * 16, device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 16,
            "window_size": 16
        })
        tests.append({
            "input": torch.tensor([[2] * 16] * 16, device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 16,
            "window_size": 15
        })
        tests.append({
            "input": torch.tensor([[2] * 16] * 16, device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 16,
            "window_size": 1
        })
        # all-negative constant value
        tests.append({
            "input": torch.tensor([[-10] * 10] * 10, device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 10,
            "window_size": 5
        })
        # random all-negative values
        tests.append({
            "input": torch.randint(-10, 0, (123, 123), device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 123,
            "window_size": 7
        })
        # random mixed-sign values
        tests.append({
            "input": torch.randint(-10, 11, (123, 123), device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 123,
            "window_size": 7
        })
        # medium size
        tests.append({
            "input": torch.randint(-10, 11, (1000, 1000), device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 1000,
            "window_size": 476
        })
        # large size
        tests.append({
            "input": torch.randint(-10, 11, (3000, 3000), device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 3000,
            "window_size": 2011
        })
        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        dtype = torch.int32
        input = torch.randint(-10, 11, (5000, 5000), device="cuda", dtype=dtype)
        output = torch.empty(1, device="cuda", dtype=dtype)
        return {
            "input": input,
            "output": output,
            "N": 5000,
            "window_size": 2500
        }
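

# A minimal sanity-check sketch (hypothetical usage, outside the challenge
# harness): it runs the reference implementation on the example test and
# prints the result. It assumes a CUDA device is available, since the test
# tensors are allocated with device="cuda".
if __name__ == "__main__":
    challenge = Challenge()
    test = challenge.generate_example_test()
    challenge.reference_impl(**test)
    print(test["output"].item())  # expected: 15 for the 3x3 example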