-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprefix_sum.comp
More file actions
241 lines (219 loc) · 7.59 KB
/
prefix_sum.comp
File metadata and controls
241 lines (219 loc) · 7.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// A prefix sum.
//
// This test builds in three configurations. The default is a
// compatibility mode, essentially plain GLSL. With ATOMIC set, the
// flag loads and stores are atomic operations, but uses barriers.
// With both ATOMIC and VKMM set, it uses acquire/release semantics
// instead of barriers.
#version 450
#extension GL_KHR_memory_scope_semantics : enable
#ifdef VKMM
#pragma use_vulkan_memory_model
#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
#else
#define ACQUIRE 0, 0
#define RELEASE 0, 0
#endif
#define N_ROWS 16
#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE) // 512
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
struct Monoid {
uint element;
};
layout(set = 0, binding = 0) readonly buffer InBuf {
Monoid[] inbuf;
};
layout(set = 0, binding = 1) buffer OutBuf {
Monoid[] outbuf;
};
// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0u
#define FLAG_AGGREGATE_READY 1u
#define FLAG_PREFIX_READY 2u
struct State {
uint flag;
Monoid aggregate;
Monoid prefix;
};
// Perhaps this should be "nonprivate" with VKMM
layout(set = 0, binding = 2) volatile buffer StateBuf {
uint part_counter;
uint element_size;
State[] state;
};
layout(set = 0, binding = 3) volatile buffer DebugBuf {
uint[] debugval;
};
shared Monoid sh_scratch[WG_SIZE];
Monoid combine_monoid(Monoid a, Monoid b) {
return Monoid(a.element + b.element);
}
shared uint sh_part_ix;
shared Monoid sh_prefix;
shared uint sh_flag;
void main() {
Monoid local[N_ROWS];
// Determine partition to process by atomic counter (described in Section
// 4.4 of prefix sum paper).
if (gl_LocalInvocationID.x == 0) {
sh_part_ix = atomicAdd(part_counter, 1);
}
barrier();
uint part_ix = sh_part_ix;
uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
// ***** ADDED *****
if (ix > element_size) { return; }
// TODO: gate buffer read? (evaluate whether shader check or
// CPU-side padding is better)
debugval[ix] = 0xFFFFFFFF;
local[0] = inbuf[ix];
for (uint i = 1; i < N_ROWS; i++) {
// ***** ADDED *****
uint ixi = ix + i;
if (ixi > element_size) { continue; }
debugval[ix + i] = 0xFFFFFFFF;
local[i] = combine_monoid(local[i - 1], inbuf[ixi]);
}
Monoid agg = local[N_ROWS - 1];
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1u << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
agg = combine_monoid(other, agg);
}
barrier();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
// Publish aggregate for this partition
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
state[part_ix].aggregate = agg;
if (part_ix == 0) {
state[0].prefix = agg;
}
}
// Write flag with release semantics; this is done portably with a barrier.
#ifndef VKMM
memoryBarrierBuffer();
#endif
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
uint flag = FLAG_AGGREGATE_READY;
if (part_ix == 0) {
flag = FLAG_PREFIX_READY;
}
#ifdef ATOMIC
atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE);
#else
state[part_ix].flag = flag;
#endif
}
Monoid exclusive = Monoid(0);
if (part_ix != 0) {
// step 4 of paper: decoupled lookback
uint look_back_ix = part_ix - 1;
Monoid their_agg;
uint their_ix = 0;
while (true) {
// Read flag with acquire semantics.
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
#ifdef ATOMIC
sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE);
#else
sh_flag = state[look_back_ix].flag;
#endif
}
// The flag load is done only in the last thread. However, because the
// translation of memoryBarrierBuffer to Metal requires uniform control
// flow, we broadcast it to all threads.
barrier();
#ifndef VKMM
memoryBarrierBuffer();
#endif
uint flag = sh_flag;
barrier();
if (flag == FLAG_PREFIX_READY) {
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
Monoid their_prefix = state[look_back_ix].prefix;
exclusive = combine_monoid(their_prefix, exclusive);
}
break;
} else if (flag == FLAG_AGGREGATE_READY) {
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
their_agg = state[look_back_ix].aggregate;
exclusive = combine_monoid(their_agg, exclusive);
}
look_back_ix--;
their_ix = 0;
continue;
}
// else spin
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
// Unfortunately there's no guarantee of forward progress of other
// workgroups, so compute a bit of the aggregate before trying again.
// In the worst case, spinning stops when the aggregate is complete.
uint lbixpti = look_back_ix * PARTITION_SIZE + their_ix;
// ***** ADDED *****
if (lbixpti > element_size) { break; }
debugval[lbixpti] = 0xFFFFFFFF;
Monoid m = inbuf[lbixpti];
if (their_ix == 0) {
their_agg = m;
} else {
their_agg = combine_monoid(their_agg, m);
}
their_ix++;
if (their_ix == PARTITION_SIZE) {
exclusive = combine_monoid(their_agg, exclusive);
if (look_back_ix == 0) {
sh_flag = FLAG_PREFIX_READY;
} else {
look_back_ix--;
their_ix = 0;
}
}
}
barrier();
flag = sh_flag;
barrier();
if (flag == FLAG_PREFIX_READY) {
break;
}
}
// step 5 of paper: compute inclusive prefix
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
Monoid inclusive_prefix = combine_monoid(exclusive, agg);
sh_prefix = exclusive;
state[part_ix].prefix = inclusive_prefix;
}
#ifndef VKMM
memoryBarrierBuffer();
#endif
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
#ifdef ATOMIC
atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE);
#else
state[part_ix].flag = FLAG_PREFIX_READY;
#endif
}
}
barrier();
if (part_ix != 0) {
exclusive = sh_prefix;
}
Monoid row = exclusive;
if (gl_LocalInvocationID.x > 0) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - 1];
row = combine_monoid(row, other);
}
for (uint i = 0; i < N_ROWS; i++) {
Monoid m = combine_monoid(row, local[i]);
uint ixi = ix + i;
// Make sure buffer allocation is padded appropriately.
if (ixi > element_size) { break; }
outbuf[ix + i] = m;
}
}