PufferLib/tests/puff_advantage_standalone.cu at b33117cecf683e27813fd41c2e76e9fde97a0080 · PufferAI/PufferLib · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/*
Build:
nvcc -O3 -arch=sm_86 -shared -Xcompiler -fPIC puff_advantage_standalone.cu -o libpuff_advantage.so
*/

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdio>

// =============================================================================
// ORIGINAL IMPLEMENTATION
// =============================================================================

__device__ void puff_advantage_row_original(float* values, float* rewards, float* dones,
        float* importance, float* advantages, float gamma, float lambda,
        float rho_clip, float c_clip, int horizon) {
    float lastpufferlam = 0;
    for (int t = horizon-2; t >= 0; t--) {
        int t_next = t + 1;
        float nextnonterminal = 1.0f - dones[t_next];
        float rho_t = fminf(importance[t], rho_clip);
        float c_t = fminf(importance[t], c_clip);
        float delta = rho_t*(rewards[t_next] + gamma*values[t_next]*nextnonterminal - values[t]);
        lastpufferlam = delta + gamma*lambda*c_t*lastpufferlam*nextnonterminal;
        advantages[t] = lastpufferlam;
    }
}

__global__ void puff_advantage_kernel_original(float* values, float* rewards,
        float* dones, float* importance, float* advantages, float gamma,
        float lambda, float rho_clip, float c_clip, int num_steps, int horizon) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= num_steps) {
        return;
    }
    int offset = row * horizon;
    puff_advantage_row_original(values + offset, rewards + offset, dones + offset,
        importance + offset, advantages + offset, gamma, lambda, rho_clip, c_clip, horizon);
}

// =============================================================================
// NEW IMPLEMENTATION
// =============================================================================

__device__ __forceinline__ void puff_advantage_row_new(float* values, float* rewards, float* dones,
        float* importance, float* advantages, float gamma, float lambda,
        float rho_clip, float c_clip, int horizon) {

    // Fall back to original if horizon not divisible by 4
    if (horizon % 4 != 0) {
        puff_advantage_row_original(values, rewards, dones,
            importance, advantages, gamma, lambda, rho_clip, c_clip, horizon);
        return;
    }

    float lastpufferlam = 0.0f;
    int num_chunks = horizon / 4;

    // need to track values across chunks
    float next_value = values[horizon - 1];
    float next_done = dones[horizon - 1];
    float next_reward = rewards[horizon - 1];

    // Process chunks from end to beginning
    for (int chunk = num_chunks - 1; chunk >= 0; chunk--) {
        int base = chunk * 4;

        // Load 4 elements at once
        float4 v4 = *reinterpret_cast<float4*>(values + base);
        float4 r4 = *reinterpret_cast<float4*>(rewards + base);
        float4 d4 = *reinterpret_cast<float4*>(dones + base);
        float4 i4 = *reinterpret_cast<float4*>(importance + base);

        float v[4] = {v4.x, v4.y, v4.z, v4.w};
        float r[4] = {r4.x, r4.y, r4.z, r4.w};
        float d[4] = {d4.x, d4.y, d4.z, d4.w};
        float imp[4] = {i4.x, i4.y, i4.z, i4.w};
        float adv[4] = {0.0f, 0.0f, 0.0f, 0.0f};

        int start_idx = (chunk == num_chunks - 1) ? 2 : 3;

        #pragma unroll
        for (int i = start_idx; i >= 0; i--) {
            float nextnonterminal = 1.0f - next_done;
            float rho_t = fminf(imp[i], rho_clip);
            float c_t = fminf(imp[i], c_clip);
            float delta = rho_t * (next_reward + gamma * next_value * nextnonterminal - v[i]);
            lastpufferlam = delta + gamma * lambda * c_t * lastpufferlam * nextnonterminal;
            adv[i] = lastpufferlam;

            next_value = v[i];
            next_done = d[i];
            next_reward = r[i];
        }

        float4 adv4 = make_float4(adv[0], adv[1], adv[2], adv[3]);
        *reinterpret_cast<float4*>(advantages + base) = adv4;
    }
}

__global__ void puff_advantage_kernel_new(float* values, float* rewards,
        float* dones, float* importance, float* advantages, float gamma,
        float lambda, float rho_clip, float c_clip, int num_steps, int horizon) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= num_steps) {
        return;
    }
    int offset = row * horizon;
    puff_advantage_row_new(values + offset, rewards + offset, dones + offset,
        importance + offset, advantages + offset, gamma, lambda, rho_clip, c_clip, horizon);
}

// =============================================================================
// C API - Exported functions callable from Python via ctypes
// =============================================================================

extern "C" {

void launch_original(float* values, float* rewards, float* dones,
        float* importance, float* advantages, float gamma, float lambda,
        float rho_clip, float c_clip, int num_steps, int horizon) {
    int threads_per_block = 256;
    int blocks = (num_steps + threads_per_block - 1) / threads_per_block;
    puff_advantage_kernel_original<<<blocks, threads_per_block>>>(
        values, rewards, dones, importance, advantages,
        gamma, lambda, rho_clip, c_clip, num_steps, horizon);
}

void launch_new(float* values, float* rewards, float* dones,
        float* importance, float* advantages, float gamma, float lambda,
        float rho_clip, float c_clip, int num_steps, int horizon) {
    int threads_per_block = 32;
    int blocks = (num_steps + threads_per_block - 1) / threads_per_block;
    puff_advantage_kernel_new<<<blocks, threads_per_block>>>(
        values, rewards, dones, importance, advantages,
        gamma, lambda, rho_clip, c_clip, num_steps, horizon);
}

void sync_device() {
    cudaDeviceSynchronize();
}

const char* get_last_error() {
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        return cudaGetErrorString(err);
    }
    return NULL;
}

}  // extern "C"