import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
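    """LoRA-augmented linear layer: output = x @ W^T + lora_scale * (x @ A^T) @ B^T."""
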
    def __init__(self):
        super().__init__(
            name="LoRA Linear",
            atol=1e-04,
            rtol=1e-04,
            num_gpus=1,
            access_tier="free",
        )

    def reference_impl(
        self,
        x: torch.Tensor,
        W: torch.Tensor,
        A: torch.Tensor,
        B: torch.Tensor,
        output: torch.Tensor,
        batch: int,
        d_in: int,
        d_out: int,
        rank: int,
        lora_scale: float,
    ):
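        """Compute output = x @ W^T + lora_scale * (x @ A^T) @ B^T, writing into `output` in place."""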
        assert x.shape == (batch, d_in)
        assert W.shape == (d_out, d_in)
        assert A.shape == (rank, d_in)
        assert B.shape == (d_out, rank)
        assert output.shape == (batch, d_out)
        assert x.dtype == W.dtype == A.dtype == B.dtype == output.dtype == torch.float32
        assert x.device.type == "cuda"
        assert W.device.type == "cuda"
        assert A.device.type == "cuda"
        assert B.device.type == "cuda"
        assert output.device.type == "cuda"

        # Base linear: output = x @ W^T
        base = torch.mm(x, W.t())

        # LoRA path: delta = lora_scale * (x @ A^T) @ B^T
        lora_hidden = torch.mm(x, A.t())  # (batch, rank)
        delta = torch.mm(lora_hidden, B.t())  # (batch, d_out)

        output.copy_(base + lora_scale * delta)

    def get_solve_signature(self) -> Dict[str, tuple]:
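        # Argument name -> (ctypes type, "in"/"out" direction); presumably the C signature of the submitted solve function.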
        return {
            "x": (ctypes.POINTER(ctypes.c_float), "in"),
            "W": (ctypes.POINTER(ctypes.c_float), "in"),
            "A": (ctypes.POINTER(ctypes.c_float), "in"),
            "B": (ctypes.POINTER(ctypes.c_float), "in"),
            "output": (ctypes.POINTER(ctypes.c_float), "out"),
            "batch": (ctypes.c_int, "in"),
            "d_in": (ctypes.c_int, "in"),
            "d_out": (ctypes.c_int, "in"),
            "rank": (ctypes.c_int, "in"),
            "lora_scale": (ctypes.c_float, "in"),
        }

    def _make_test_case(self, batch, d_in, d_out, rank, lora_scale=0.5, zero_x=False):
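        # Build one randomized float32 test case on the CUDA device.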
        dtype = torch.float32
        device = "cuda"
        if zero_x:
            x = torch.zeros(batch, d_in, device=device, dtype=dtype)
        else:
            x = torch.randn(batch, d_in, device=device, dtype=dtype)
        W = torch.randn(d_out, d_in, device=device, dtype=dtype) * 0.02
        A = torch.randn(rank, d_in, device=device, dtype=dtype) * 0.02
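        # B is zero-initialized (the standard LoRA init), so the LoRA delta is zero for these generated cases.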
        B = torch.zeros(d_out, rank, device=device, dtype=dtype)
        output = torch.zeros(batch, d_out, device=device, dtype=dtype)
        return {
            "x": x,
            "W": W,
            "A": A,
            "B": B,
            "output": output,
            "batch": batch,
            "d_in": d_in,
            "d_out": d_out,
            "rank": rank,
            "lora_scale": lora_scale,
        }

    def generate_example_test(self) -> Dict[str, Any]:
        dtype = torch.float32
        device = "cuda"
        x = torch.tensor([[1.0, 0.0, -1.0, 2.0], [0.0, 1.0, 1.0, -1.0]], device=device, dtype=dtype)
        W = torch.tensor(
            [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]],
            device=device,
            dtype=dtype,
        )
        A = torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], device=device, dtype=dtype)
        B = torch.tensor(
            [[1.0, 0.0], [0.0, 1.0], [0.0, 0.0]],
            device=device,
            dtype=dtype,
        )
        output = torch.zeros(2, 3, device=device, dtype=dtype)
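        # Expected result: base = x @ W^T keeps the first three input coords, and the LoRA path adds
        # 0.5 * (x @ A^T) @ B^T, so output should be [[1.5, 0.0, -1.0], [0.0, 1.5, 1.0]].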
        return {
            "x": x,
            "W": W,
            "A": A,
            "B": B,
            "output": output,
            "batch": 2,
            "d_in": 4,
            "d_out": 3,
            "rank": 2,
            "lora_scale": 0.5,
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        torch.manual_seed(42)
        tests = []

        # Edge case: batch=1, tiny dimensions
        tests.append(self._make_test_case(1, 4, 4, 1))

        # Edge case: zero input
        tests.append(self._make_test_case(2, 8, 8, 2, zero_x=True))

        # Edge case: rank=1 (minimum LoRA rank)
        tests.append(self._make_test_case(4, 16, 16, 1))

        # Power-of-2 dimensions
        tests.append(self._make_test_case(16, 64, 64, 8))

        # Power-of-2, non-square
        tests.append(self._make_test_case(32, 128, 64, 16))

        # Non-power-of-2 dimensions
        tests.append(self._make_test_case(30, 100, 100, 4))

        # Non-power-of-2, mixed
        tests.append(self._make_test_case(7, 255, 128, 8))

        # Realistic small: LLM feed-forward style
        tests.append(self._make_test_case(64, 512, 512, 16, lora_scale=0.125))

        # Negative inputs
        tests.append(
            {
                "x": torch.full((4, 32), -1.0, device="cuda", dtype=torch.float32),
                "W": torch.randn(32, 32, device="cuda", dtype=torch.float32) * 0.02,
                "A": torch.randn(8, 32, device="cuda", dtype=torch.float32) * 0.02,
                "B": torch.randn(32, 8, device="cuda", dtype=torch.float32) * 0.02,
                "output": torch.zeros(4, 32, device="cuda", dtype=torch.float32),
                "batch": 4,
                "d_in": 32,
                "d_out": 32,
                "rank": 8,
                "lora_scale": 1.0,
            }
        )

        # Larger realistic: transformer hidden size
        tests.append(self._make_test_case(128, 1024, 1024, 32, lora_scale=0.0625))

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        torch.manual_seed(0)
        # LLaMA-style: d_in=d_out=4096, rank=64, batch=256
        return self._make_test_case(256, 4096, 4096, 64, lora_scale=0.015625)