Starlit-Collection/prefix_sum.comp at main · HighlyExistant/Starlit-Collection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// A prefix sum.
//
// This test builds in three configurations. The default is a
// compatibility mode, essentially plain GLSL. With ATOMIC set, the
// flag loads and stores are atomic operations, but uses barriers.
// With both ATOMIC and VKMM set, it uses acquire/release semantics
// instead of barriers.

#version 450

#extension GL_KHR_memory_scope_semantics : enable

#ifdef VKMM
#pragma use_vulkan_memory_model
#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
#else
#define ACQUIRE 0, 0
#define RELEASE 0, 0
#endif

#define N_ROWS 16
#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE) // 512
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

struct Monoid {
    uint element;
};

layout(set = 0, binding = 0) readonly buffer InBuf {
    Monoid[] inbuf;
};

layout(set = 0, binding = 1) buffer OutBuf {
    Monoid[] outbuf;
};

// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0u
#define FLAG_AGGREGATE_READY 1u
#define FLAG_PREFIX_READY 2u

struct State {
    uint flag;
    Monoid aggregate;
    Monoid prefix;
};

// Perhaps this should be "nonprivate" with VKMM
layout(set = 0, binding = 2) volatile buffer StateBuf {
    uint part_counter;
    uint element_size;
    State[] state;
};
layout(set = 0, binding = 3) volatile buffer DebugBuf {
    uint[] debugval;
};

shared Monoid sh_scratch[WG_SIZE];

Monoid combine_monoid(Monoid a, Monoid b) {
    return Monoid(a.element + b.element);
}

shared uint sh_part_ix;
shared Monoid sh_prefix;
shared uint sh_flag;

void main() {
    Monoid local[N_ROWS];
    // Determine partition to process by atomic counter (described in Section
    // 4.4 of prefix sum paper).
    if (gl_LocalInvocationID.x == 0) {
        sh_part_ix = atomicAdd(part_counter, 1);
    }
    barrier();
    uint part_ix = sh_part_ix;

    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
    // ***** ADDED *****
    if (ix > element_size) { return; }
    // TODO: gate buffer read? (evaluate whether shader check or
    // CPU-side padding is better)
    debugval[ix] = 0xFFFFFFFF;
    local[0] = inbuf[ix];
    for (uint i = 1; i < N_ROWS; i++) {
        // ***** ADDED *****
        uint ixi = ix + i;
        if (ixi > element_size) { continue; }
        debugval[ix + i] = 0xFFFFFFFF;
        local[i] = combine_monoid(local[i - 1], inbuf[ixi]);
    }
    Monoid agg = local[N_ROWS - 1];
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
            agg = combine_monoid(other, agg);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }

    // Publish aggregate for this partition
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        state[part_ix].aggregate = agg;
        if (part_ix == 0) {
            state[0].prefix = agg;
        }
    }
    // Write flag with release semantics; this is done portably with a barrier.
#ifndef VKMM
    memoryBarrierBuffer();
#endif
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        uint flag = FLAG_AGGREGATE_READY;
        if (part_ix == 0) {
            flag = FLAG_PREFIX_READY;
        }
#ifdef ATOMIC
        atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE);
#else
        state[part_ix].flag = flag;
#endif
    }

    Monoid exclusive = Monoid(0);
    if (part_ix != 0) {
        // step 4 of paper: decoupled lookback
        uint look_back_ix = part_ix - 1;

        Monoid their_agg;
        uint their_ix = 0;
        while (true) {
            // Read flag with acquire semantics.
            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
#ifdef ATOMIC
                sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE);
#else
                sh_flag = state[look_back_ix].flag;
#endif
            }
            // The flag load is done only in the last thread. However, because the
            // translation of memoryBarrierBuffer to Metal requires uniform control
            // flow, we broadcast it to all threads.
            barrier();
#ifndef VKMM
            memoryBarrierBuffer();
#endif
            uint flag = sh_flag;
            barrier();

            if (flag == FLAG_PREFIX_READY) {
                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                    Monoid their_prefix = state[look_back_ix].prefix;
                    exclusive = combine_monoid(their_prefix, exclusive);
                }
                break;
            } else if (flag == FLAG_AGGREGATE_READY) {
                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                    their_agg = state[look_back_ix].aggregate;
                    exclusive = combine_monoid(their_agg, exclusive);
                }
                look_back_ix--;
                their_ix = 0;
                continue;
            }
            // else spin
            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                // Unfortunately there's no guarantee of forward progress of other
                // workgroups, so compute a bit of the aggregate before trying again.
                // In the worst case, spinning stops when the aggregate is complete.
                uint lbixpti = look_back_ix * PARTITION_SIZE + their_ix;
                // ***** ADDED *****
                if (lbixpti > element_size) { break; }
                debugval[lbixpti] = 0xFFFFFFFF;
                Monoid m = inbuf[lbixpti];
                if (their_ix == 0) {
                    their_agg = m;
                } else {
                    their_agg = combine_monoid(their_agg, m);
                }
                their_ix++;
                if (their_ix == PARTITION_SIZE) {
                    exclusive = combine_monoid(their_agg, exclusive);
                    if (look_back_ix == 0) {
                        sh_flag = FLAG_PREFIX_READY;
                    } else {
                        look_back_ix--;
                        their_ix = 0;
                    }
                }
            }
            barrier();
            flag = sh_flag;
            barrier();
            if (flag == FLAG_PREFIX_READY) {
                break;
            }
        }
        // step 5 of paper: compute inclusive prefix
        if (gl_LocalInvocationID.x == WG_SIZE - 1) {
            Monoid inclusive_prefix = combine_monoid(exclusive, agg);
            sh_prefix = exclusive;
            state[part_ix].prefix = inclusive_prefix;
        }
#ifndef VKMM
        memoryBarrierBuffer();
#endif
        if (gl_LocalInvocationID.x == WG_SIZE - 1) {
#ifdef ATOMIC
            atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE);
#else
            state[part_ix].flag = FLAG_PREFIX_READY;
#endif
        }
    }
    barrier();
    if (part_ix != 0) {
        exclusive = sh_prefix;
    }

    Monoid row = exclusive;
    if (gl_LocalInvocationID.x > 0) {
        Monoid other = sh_scratch[gl_LocalInvocationID.x - 1];
        row = combine_monoid(row, other);
    }
    for (uint i = 0; i < N_ROWS; i++) {
        Monoid m = combine_monoid(row, local[i]);
        uint ixi = ix + i;
        // Make sure buffer allocation is padded appropriately.
        if (ixi > element_size) { break; }
        outbuf[ix + i] = m;
    }
}