Skip to content

Commit 193db17

Browse files
committed
Switched to using unified/managed memory
1 parent c7000de commit 193db17

2 files changed

Lines changed: 65 additions & 130 deletions

File tree

examples/basic/normal.cu

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,22 @@ int main(){
3434

3535
checkCudaError(cudaMemcpy(d_input, h_input.data(), sizeof(int)* prob_size, cudaMemcpyHostToDevice));
3636

37-
CudaMemAccessStorage<int> memAccessStorage(10000);
37+
// The overloaded new operator allocates this object in unified (managed) memory
38+
CudaMemAccessStorage<int>* memAccessStorage = new CudaMemAccessStorage<int>(10000);
3839

39-
40-
CudaMemAccessLogger<int> input(d_input, prob_size, "Input Datastructure", &memAccessStorage);
41-
CudaMemAccessLogger<int> output(d_output, prob_size, "Output Datastructure", &memAccessStorage);
40+
// The overloaded new operator allocates these objects in unified (managed) memory
41+
CudaMemAccessLogger<int>* input = new CudaMemAccessLogger<int>(d_input, prob_size, "Input Datastructure", memAccessStorage);
42+
CudaMemAccessLogger<int>* output = new CudaMemAccessLogger<int>(d_output, prob_size, "Output Datastructure", memAccessStorage);
4243

4344
constexpr int threads = 32;
4445
constexpr int blocks = (prob_size/threads)+1;
4546

46-
kernel<<<blocks, threads>>>(prob_size, input.getDevicePointer(), output.getDevicePointer());
47+
//kernel<<<blocks, threads>>>(prob_size, input.getDevicePointer(), output.getDevicePointer());
48+
kernel << <blocks, threads >> > (prob_size, input, output);
4749
checkCudaError(cudaGetLastError());
4850
cudaDeviceSynchronize();
4951

50-
memAccessStorage.generateOutput("../../../html/template.json", "../../../out/basic.json", CudaMemAccessStorage<int>::parseDataForJSPage);
52+
memAccessStorage->generateOutput("../../../html/template.json", "../../../out/basic.json", CudaMemAccessStorage<int>::parseDataForJSPage);
5153

5254
checkCudaError(cudaMemcpy(h_output.data(), d_output, sizeof(int)*prob_size, cudaMemcpyDeviceToHost));
5355

@@ -61,6 +63,11 @@ int main(){
6163
checkCudaError(cudaFree(d_input));
6264
checkCudaError(cudaFree(d_output));
6365

66+
// Free up the managed memory objects
67+
delete memAccessStorage;
68+
delete input;
69+
delete output;
70+
6471
std::cout << "kernel finished successful" << std::endl;
6572
return 0;
6673
}

include/cuda_mav.cuh

Lines changed: 52 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,27 @@
88
#include <iostream>
#include <new>
#include <vector>
1010

11+
// Base class for creating managed memory objects
12+
// source: https://developer.nvidia.com/blog/unified-memory-in-cuda-6/
13+
class Managed
14+
{
15+
public:
16+
void* operator new(size_t len) {
17+
void* ptr;
18+
cudaMallocManaged(&ptr, len);
19+
cudaDeviceSynchronize();
20+
return ptr;
21+
}
22+
23+
void operator delete(void* ptr) {
24+
cudaDeviceSynchronize();
25+
cudaFree(ptr);
26+
}
27+
};
28+
1129
// Define a custom template class, which takes care of storing the memory accesses of a given type.
1230
template<typename T>
13-
class CudaMemAccessStorage {
31+
class CudaMemAccessStorage : public Managed{
1432

1533
struct GlobalSettings {
1634
int gridDimX;
@@ -77,14 +95,10 @@ private:
7795

7896
// Have a array of ints on the device used to store grid dimensions, and block dimensions, and the warp size
7997
// This array will always have 10 elements (x,y,z) for grid dimensions, (x,y,z) for block dimensions, and 1 for warp size, 1 for status messages, 1 for the original log size, 1 for the current log size
80-
GlobalSettings *d_constantData = nullptr;
81-
// Have the same array on the host
82-
GlobalSettings *h_constantData = nullptr;
98+
GlobalSettings constantData;
8399

84100
// Have a pointer to a list of memory access logs
85-
MemoryAccessLog *d_memoryAccessLog = nullptr;
86-
// Have a pointer to a list of memory access logs on the host
87-
MemoryAccessLog *h_memoryAccessLog = nullptr;
101+
MemoryAccessLog* memoryAccessLog = nullptr;
88102

89103
// Store the memory regions by storing the starting address, the amount of elements, the size of a single element and a name
90104
std::vector<std::tuple<T*, size_t, size_t, std::string>> memoryRegions;
@@ -256,95 +270,44 @@ public:
256270
}
257271

258272
private:
259-
// Define function to load back data
260-
void fetchData() {
261-
// First fetch the h_constantData from the device
262-
checkCudaError(cudaMemcpy(h_constantData, d_constantData, sizeof(GlobalSettings), cudaMemcpyDeviceToHost),
263-
"Could not copy constant data from device.");
264-
265-
// Copy the data back from the device
266-
checkCudaError(
267-
cudaMemcpy(h_memoryAccessLog, d_memoryAccessLog, sizeof(MemoryAccessLog) * h_constantData->originalSize,
268-
cudaMemcpyDeviceToHost), "Could not copy memory access logs from device.");
269-
}
270273

271274
__device__ int getStorageIndex() {// Atomically increase the currentSize by 1
272-
int current_index = atomicAdd(&d_constantData->currentSize, 1);
275+
int current_index = atomicAdd(&constantData.currentSize, 1);
273276

274277
// First check if the currentSize is zero, if so we need to initialize the additional data variables, needed later to restore the data
275278
if (current_index == 0) {
276279
// Store the grid dimensions
277-
d_constantData->gridDimX = gridDim.x;
278-
d_constantData->gridDimY = gridDim.y;
279-
d_constantData->gridDimZ = gridDim.z;
280+
constantData.gridDimY = gridDim.y;
281+
constantData.gridDimZ = gridDim.z;
282+
constantData.gridDimX = gridDim.x;
280283
// Store the block dimensions
281-
d_constantData->blockDimX = blockDim.x;
282-
d_constantData->blockDimY = blockDim.y;
283-
d_constantData->blockDimZ = blockDim.z;
284+
constantData.blockDimX = blockDim.x;
285+
constantData.blockDimY = blockDim.y;
286+
constantData.blockDimZ = blockDim.z;
284287
// Store the warp size
285-
d_constantData->warpSize = warpSize;
288+
constantData.warpSize = warpSize;
286289
}
287290
return current_index;
288291
}
289292

290-
CudaMemAccessStorage<T> *d_this;
291-
292293
public:
293294

294-
__device__ CudaMemAccessStorage() {
295-
// Set the default values
296-
d_memoryAccessLog = nullptr;
297-
}
298-
299295
// Constructor which allocates the memory on the device
300296
__host__ CudaMemAccessStorage(unsigned int size) {
301297

302-
h_constantData = new GlobalSettings{-1, -1, -1, -1, -1, -1, -1, size, 0};
298+
constantData = {-1, -1, -1, -1, -1, -1, -1, size, 0};
303299

304-
// Allocate the memory on the device for the d_constantData and check if it was successful
305-
checkCudaError(cudaMalloc(&d_constantData, sizeof(GlobalSettings)),
306-
"Could not allocate array to store kernel data on device.");
307-
// Copy over the host data to the device
308-
checkCudaError(cudaMemcpy(d_constantData, h_constantData, sizeof(GlobalSettings), cudaMemcpyHostToDevice),
300+
// Allocate the memory for the memoryAccessLog and check if it was successful
301+
checkCudaError(cudaMallocManaged(&memoryAccessLog, sizeof(MemoryAccessLog) * size),
309302
"Could not copy constant data to device.");
310-
311-
// Allocate the memory on the device for the d_memoryAccessLog and check if it was successful
312-
checkCudaError(cudaMalloc(&d_memoryAccessLog, sizeof(MemoryAccessLog) * size),
313-
"Could not allocate array to store memory access logs on device. (reading)");
314-
// Also allocate the memory on the host for the h_memoryAccessLog
315-
h_memoryAccessLog = new MemoryAccessLog[size];
316-
317-
// Copy the empty data to the device
318-
checkCudaError(cudaMemcpy(d_memoryAccessLog, h_memoryAccessLog, sizeof(MemoryAccessLog) * size,
319-
cudaMemcpyHostToDevice), "Could not copy memory access logs to device.");
320-
321-
// Now we finished initializing the class, so we need to create the copy of this class on the device
322-
// Allocate the memory on the device for the d_this and check if it was successful
323-
checkCudaError(cudaMalloc(&d_this, sizeof(CudaMemAccessStorage<T>)),
324-
"Could not allocate array to store this class on device.");
325-
// Copy the empty data to the device
326-
checkCudaError(cudaMemcpy(d_this, this, sizeof(CudaMemAccessStorage<T>), cudaMemcpyHostToDevice),
327-
"Could not copy this class to device.");
328-
329303
}
330304

331305
__host__ void registerArray(T *array, size_t size, std::string name = "") {
332306
memoryRegions.push_back(std::make_tuple(array, size, sizeof(array[0]), name));
333307
}
334308

335309
__host__ ~CudaMemAccessStorage() {
336-
// Free the memory on the device
337-
checkCudaError(cudaFree(d_memoryAccessLog), "Could not free memory access logs on device.");
338-
checkCudaError(cudaFree(d_constantData), "Could not free constant data on device.");
339-
checkCudaError(cudaFree(d_this), "Could not free this class on device.");
340-
341-
// Free the memory on the host
342-
delete[] h_memoryAccessLog;
343-
delete h_constantData;
344-
}
345-
346-
__host__ __device__ CudaMemAccessStorage<T> *getDevicePointer() {
347-
return d_this;
310+
checkCudaError(cudaFree(memoryAccessLog), "Could not free memory access logs.");
348311
}
349312

350313
__device__ void pushReadLog(T *address) {
@@ -355,9 +318,9 @@ public:
355318
unsigned int threadId = threadIdx.x + threadIdx.y * blockDim.x + blockDim.x * blockDim.y * threadIdx.z;
356319

357320
// Check that our current index is less than the original size
358-
if (current_index < d_constantData->originalSize) {
321+
if (current_index < constantData.originalSize) {
359322
// Store the data in the memory access log
360-
d_memoryAccessLog[current_index] = MemoryAccessLog(address, blockId, threadId, true);
323+
memoryAccessLog[current_index] = MemoryAccessLog(address, blockId, threadId, true);
361324
}
362325
}
363326

@@ -369,9 +332,9 @@ public:
369332
unsigned int threadId = threadIdx.x + threadIdx.y * blockDim.x + blockDim.x * blockDim.y * threadIdx.z;
370333

371334
// Check that our current index is less than the original size
372-
if (current_index < d_constantData->originalSize) {
335+
if (current_index < constantData.originalSize) {
373336
// Store the data in the memory access log
374-
d_memoryAccessLog[current_index] = MemoryAccessLog(address, blockId, threadId, false);
337+
memoryAccessLog[current_index] = MemoryAccessLog(address, blockId, threadId, false);
375338
}
376339
}
377340

@@ -380,8 +343,6 @@ public:
380343
void generateOutput(const std::string template_file, const std::string &output_path,
381344
std::function<std::tuple<std::string, std::string>(GlobalSettings settingsStruct, std::vector<std::tuple<T*, size_t, size_t, std::string>> memoryRegions,
382345
std::vector<MemoryAccessLog> accessLogs)> customGenerationFunction = nullptr) {
383-
// Fetch the data back from the device
384-
fetchData();
385346

386347
// Data processing code here
387348

@@ -402,18 +363,18 @@ public:
402363
std::vector<MemoryAccessLog> accessLogs;
403364

404365
// Loop over the read logs
405-
for (int i = 0; i < h_constantData->currentSize; i++) {
366+
for (int i = 0; i < constantData.currentSize; i++) {
406367
// Add the log to the vector
407-
accessLogs.push_back(h_memoryAccessLog[i]);
368+
accessLogs.push_back(memoryAccessLog[i]);
408369
}
409370

410371
// Run the custom generation function, if one was passed
411372
if (customGenerationFunction != nullptr) {
412-
placeholderReplacement = customGenerationFunction(*h_constantData, memoryRegions, accessLogs);
373+
placeholderReplacement = customGenerationFunction(constantData, memoryRegions, accessLogs);
413374
}
414375
// If none was passed, use a default function
415376
else {
416-
placeholderReplacement = parseDataForStaticHTML(*h_constantData, memoryRegions, accessLogs);
377+
placeholderReplacement = parseDataForStaticHTML(constantData, memoryRegions, accessLogs);
417378
}
418379

419380
// Replace "<!-- HTML_TEMPLATE -->" with the HTML template
@@ -445,31 +406,23 @@ public:
445406
}
446407

447408
__host__ const GlobalSettings getGlobalSettings() {
448-
// Fetch the data back from the device
449-
fetchData();
450-
451-
return *h_constantData;
409+
return constantData;
452410
}
453411

454412
};
455413

456414

457415
// Define a custom template class which holds the data for the CUDA kernel
458416
template<typename T>
459-
class CudaMemAccessLogger {
417+
class CudaMemAccessLogger : public Managed {
460418

461419
private:
462420

463421
// Have an internal pointer to the data, this pointer is a device pointer
464422
T *d_data;
465423

466424
// Have a pointer to the storage class we are using
467-
CudaMemAccessStorage<T> *h_storage;
468-
// We also need to store its device pointer
469-
CudaMemAccessStorage<T> *d_storage;
470-
471-
// Also have an instance of this class allocated on the device
472-
CudaMemAccessLogger<T> *d_this;
425+
CudaMemAccessStorage<T> *storage;
473426

474427
// Implement a proxy class so we can both read and write from the array when accessing the array operator
475428
class AccessProxy {
@@ -524,37 +477,23 @@ public:
524477
}
525478

526479
// Constructor which allocates the memory on the device
527-
__host__ CudaMemAccessLogger(T *array_data, size_t array_length, std::string description_name, CudaMemAccessStorage<T> *storage = nullptr) {
480+
__host__ CudaMemAccessLogger(T *array_data, size_t array_length, std::string description_name, CudaMemAccessStorage<T> *cma_storage = nullptr) {
528481

529482
// Store the passed data pointer within the class
530483
d_data = array_data;
531484

532485
// If the storage class is not null, we need to store it
533-
if (storage != nullptr) {
486+
if (cma_storage != nullptr) {
534487
// Store the storage class
535-
h_storage = storage;
488+
storage = cma_storage;
536489
}
537490
// If it is null, we need to create a new storage class
538491
else {
539492
// Create a new storage class
540-
h_storage = new CudaMemAccessStorage<T>(10000);
493+
storage = new CudaMemAccessStorage<T>(10000);
541494
}
542495

543-
// Also store the device pointer
544-
d_storage = h_storage->getDevicePointer();
545-
546-
h_storage->registerArray(array_data, array_length, description_name);
547-
548-
549-
550-
// We need to create the copy of this class on the device
551-
// Allocate the memory on the device for the d_this and check if it was successful
552-
checkCudaError(cudaMalloc(&d_this, sizeof(CudaMemAccessLogger<T>)),
553-
"Could not allocate array to store this class on device.");
554-
// Copy the empty data to the device
555-
checkCudaError(cudaMemcpy(d_this, this, sizeof(CudaMemAccessLogger<T>), cudaMemcpyHostToDevice),
556-
"Could not copy this class to device.");
557-
496+
storage->registerArray(array_data, array_length, description_name);
558497
}
559498

560499
__device__ T get(unsigned int index) {
@@ -563,7 +502,7 @@ public:
563502
T *address = &d_data[index];
564503

565504
// Push the read log
566-
d_storage->pushReadLog(address);
505+
storage->pushReadLog(address);
567506

568507
// Print the accessed data
569508
//printf("Accessed data at address %p by thread %d in block %d\n", &d_data[index], threadId, blockId);
@@ -577,7 +516,7 @@ public:
577516
T *address = &d_data[index];
578517

579518
// Push the write log
580-
d_storage->pushWriteLog(address);
519+
storage->pushWriteLog(address);
581520

582521
// Write the value to the data
583522
d_data[index] = value;
@@ -586,13 +525,6 @@ public:
586525
//printf("Wrote data at address %p by thread %d in block %d\n", &d_data[index], threadId, blockId);
587526
}
588527

589-
590-
// Destructor to free the memory on the device
591-
__host__ ~CudaMemAccessLogger() {
592-
// Free up the memory on the device
593-
checkCudaError(cudaFree(d_this), "Could not free class instance pointer on device.");
594-
}
595-
596528
// Array operator overload on the device
597529
__device__ AccessProxy operator[](size_t i) {
598530
return AccessProxy(this, i);
@@ -610,11 +542,7 @@ public:
610542
#endif
611543
}
612544

613-
__host__ CudaMemAccessLogger<T> *getDevicePointer() {
614-
return d_this;
615-
}
616-
617545
CudaMemAccessStorage<T>* getStorage() {
618-
return h_storage;
546+
return storage;
619547
}
620548
};

0 commit comments

Comments
 (0)