Skip to content

Commit 2c33501

Browse files
committed
Add more tunable parameters to vector_add_parallel.py example
1 parent ea13fdb commit 2c33501

1 file changed

Lines changed: 11 additions & 3 deletions

File tree

examples/cuda/vector_add_parallel.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,15 @@
77
def tune():
88
kernel_string = """
99
__global__ void vector_add(float *c, float *a, float *b, int n) {
10-
int i = (blockIdx.x * block_size_x) + threadIdx.x;
11-
if ( i < n ) {
12-
c[i] = a[i] + b[i];
10+
int base = ((blockIdx.x * block_size_x) + threadIdx.x) * elements_per_thread;
11+
12+
#pragma unroll unroll_factor
13+
for (int offset = 0; offset < elements_per_thread; offset++) {
14+
int i = base + offset;
15+
16+
if ( i < n ) {
17+
c[i] = a[i] + b[i];
18+
}
1319
}
1420
}
1521
"""
@@ -25,6 +31,8 @@ def tune():
2531

2632
tune_params = dict()
2733
tune_params["block_size_x"] = [32 * i for i in range(1, 33)]
34+
tune_params["elements_per_thread"] = [1, 2, 3, 4, 5, 6, 7, 8]
35+
tune_params["unroll_factor"] = [1, 2, 3, 4, 5, 6, 7, 8]
2836

2937
results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, parallel=True)
3038
print(env)

0 commit comments

Comments
 (0)