+{
+  "language": "code_cuda",
+  "groups": [
+    [0, 100],
+    [101, 300],
+    [301, 600],
+    [601, 9999]
+  ],
+  "quotes": [
+    {
+      "text": "// 32 bit Murmur3 hash\n__device__ uint32_t hash(uint32_t k)\n{\n\tk ^= k >> 16;\n\tk *= 0x85ebca6b;\n\tk ^= k >> 13;\n\tk *= 0xc2b2ae35;\n\tk ^= k >> 16;\n\treturn k & (kHashTableCapacity-1);\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 180,
+      "id": 1
+    },
+    {
+      "text": "// Create a hash table. For linear probing, this is just an array of KeyValues\nKeyValue* create_hashtable()\n{\n\t// Allocate memory\n\tKeyValue* hashtable;\n\tcudaMalloc(&hashtable, sizeof(KeyValue) * kHashTableCapacity);\n\n\t// Initialize hash table to empty\n\tstatic_assert(kEmpty == 0xffffffff, \"memset expected kEmpty=0xffffffff\");\n\tcudaMemset(hashtable, 0xff, sizeof(KeyValue) * kHashTableCapacity);\n\n\treturn hashtable;\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 417,
+      "id": 2
+    },
+    {
+      "text": "// Insert the key/values in kvs into the hashtable\n__global__ void gpu_hashtable_insert(KeyValue* hashtable, const KeyValue* kvs, unsigned int numkvs)\n{\n\tunsigned int threadid = blockIdx.x*blockDim.x + threadIdx.x;\n\tif (threadid < numkvs)\n\t{\n\t\tuint32_t key = kvs[threadid].key;\n\t\tuint32_t value = kvs[threadid].value;\n\t\tuint32_t slot = hash(key);\n\n\t\twhile (true)\n\t\t{\n\t\t\tuint32_t prev = atomicCAS(&hashtable[slot].key, kEmpty, key);\n\t\t\tif (prev == kEmpty || prev == key)\n\t\t\t{\n\t\t\t\thashtable[slot].value = value;\n\t\t\t\treturn;\n\t\t\t}\n\n\t\t\tslot = (slot + 1) & (kHashTableCapacity-1);\n\t\t}\n\t}\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 583,
+      "id": 3
+    },
+    {
+      "text": "void insert_hashtable(KeyValue* pHashTable, const KeyValue* kvs, uint32_t num_kvs)\n{\n\t// Copy the keyvalues to the GPU\n\tKeyValue* device_kvs;\n\tcudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs);\n\tcudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice);\n\n\t// Have CUDA calculate the thread block size\n\tint mingridsize;\n\tint threadblocksize;\n\tcudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0);\n\n\t// Create events for GPU timing\n\tcudaEvent_t start, stop;\n\tcudaEventCreate(&start);\n\tcudaEventCreate(&stop);\n\n\tcudaEventRecord(start);\n\n\t// Insert all the keys into the hash table\n\tint gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize;\n\tgpu_hashtable_insert<<<gridsize, threadblocksize>>>(pHashTable, device_kvs, (uint32_t)num_kvs);\n\n\tcudaEventRecord(stop);\n\n\tcudaEventSynchronize(stop);\n\n\tfloat milliseconds = 0;\n\tcudaEventElapsedTime(&milliseconds, start, stop);\n\tfloat seconds = milliseconds / 1000.0f;\n\tprintf(\"\\tGPU inserted %d items in %f ms (%f million keys/second)\\n\",\n\t\tnum_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f);\n\n\tcudaFree(device_kvs);\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 1153,
+      "id": 4
+    },
+    {
+      "text": "// Lookup keys in the hashtable, and return the values\n__global__ void gpu_hashtable_lookup(KeyValue* hashtable, KeyValue* kvs, unsigned int numkvs)\n{\n\tunsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (threadid < numkvs)\n\t{\n\t\tuint32_t key = kvs[threadid].key;\n\t\tuint32_t slot = hash(key);\n\n\t\twhile (true)\n\t\t{\n\t\t\tif (hashtable[slot].key == key)\n\t\t\t{\n\t\t\t\tkvs[threadid].value = hashtable[slot].value;\n\t\t\t\treturn;\n\t\t\t}\n\t\t\tif (hashtable[slot].key == kEmpty)\n\t\t\t{\n\t\t\t\tkvs[threadid].value = kEmpty;\n\t\t\t\treturn;\n\t\t\t}\n\t\t\tslot = (slot + 1) & (kHashTableCapacity - 1);\n\t\t}\n\t}\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 584,
+      "id": 5
+    },
+    {
+      "text": "void lookup_hashtable(KeyValue* pHashTable, KeyValue* kvs, uint32_t num_kvs)\n{\n\t// Copy the keyvalues to the GPU\n\tKeyValue* device_kvs;\n\tcudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs);\n\tcudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice);\n\n\t// Have CUDA calculate the thread block size\n\tint mingridsize;\n\tint threadblocksize;\n\tcudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_lookup, 0, 0);\n\n\t// Create events for GPU timing\n\tcudaEvent_t start, stop;\n\tcudaEventCreate(&start);\n\tcudaEventCreate(&stop);\n\n\tcudaEventRecord(start);\n\n\t// Look up all the keys in the hash table\n\tint gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize;\n\tgpu_hashtable_lookup<<<gridsize, threadblocksize>>>(pHashTable, device_kvs, (uint32_t)num_kvs);\n\n\tcudaEventRecord(stop);\n\n\tcudaEventSynchronize(stop);\n\n\tfloat milliseconds = 0;\n\tcudaEventElapsedTime(&milliseconds, start, stop);\n\tfloat seconds = milliseconds / 1000.0f;\n\tprintf(\"\\tGPU looked up %d items in %f ms (%f million keys/second)\\n\",\n\t\tnum_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f);\n\n\tcudaFree(device_kvs);\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 1147,
+      "id": 6
+    },
+    {
+      "text": "// Delete each key in kvs from the hash table, if the key exists\n// A deleted key is left in the hash table, but its value is set to kEmpty\n// Deleted keys are not reused; once a key is assigned a slot, it never moves\n__global__ void gpu_hashtable_delete(KeyValue* hashtable, const KeyValue* kvs, unsigned int numkvs)\n{\n\tunsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (threadid < numkvs)\n\t{\n\t\tuint32_t key = kvs[threadid].key;\n\t\tuint32_t slot = hash(key);\n\n\t\twhile (true)\n\t\t{\n\t\t\tif (hashtable[slot].key == key)\n\t\t\t{\n\t\t\t\thashtable[slot].value = kEmpty;\n\t\t\t\treturn;\n\t\t\t}\n\t\t\tif (hashtable[slot].key == kEmpty)\n\t\t\t{\n\t\t\t\treturn;\n\t\t\t}\n\t\t\tslot = (slot + 1) & (kHashTableCapacity - 1);\n\t\t}\n\t}\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 706,
+      "id": 7
+    },
+    {
+      "text": "void delete_hashtable(KeyValue* pHashTable, const KeyValue* kvs, uint32_t num_kvs)\n{\n\t// Copy the keyvalues to the GPU\n\tKeyValue* device_kvs;\n\tcudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs);\n\tcudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice);\n\n\t// Have CUDA calculate the thread block size\n\tint mingridsize;\n\tint threadblocksize;\n\tcudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_delete, 0, 0);\n\n\t// Create events for GPU timing\n\tcudaEvent_t start, stop;\n\tcudaEventCreate(&start);\n\tcudaEventCreate(&stop);\n\n\tcudaEventRecord(start);\n\n\t// Delete all the keys from the hash table\n\tint gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize;\n\tgpu_hashtable_delete<<<gridsize, threadblocksize>>>(pHashTable, device_kvs, (uint32_t)num_kvs);\n\n\tcudaEventRecord(stop);\n\n\tcudaEventSynchronize(stop);\n\n\tfloat milliseconds = 0;\n\tcudaEventElapsedTime(&milliseconds, start, stop);\n\tfloat seconds = milliseconds / 1000.0f;\n\tprintf(\"\\tGPU deleted %d items in %f ms (%f million keys/second)\\n\",\n\t\tnum_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f);\n\n\tcudaFree(device_kvs);\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 1152,
+      "id": 8
+    },
+    {
+      "text": "// Iterate over every item in the hashtable; return non-empty key/values\n__global__ void gpu_iterate_hashtable(KeyValue* pHashTable, KeyValue* kvs, uint32_t* kvs_size)\n{\n\tunsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (threadid < kHashTableCapacity)\n\t{\n\t\tif (pHashTable[threadid].key != kEmpty)\n\t\t{\n\t\t\tuint32_t value = pHashTable[threadid].value;\n\t\t\tif (value != kEmpty)\n\t\t\t{\n\t\t\t\tuint32_t size = atomicAdd(kvs_size, 1);\n\t\t\t\tkvs[size] = pHashTable[threadid];\n\t\t\t}\n\t\t}\n\t}\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 491,
+      "id": 9
+    },
+    {
+      "text": "std::vector<KeyValue> iterate_hashtable(KeyValue* pHashTable)\n{\n\tuint32_t* device_num_kvs;\n\tcudaMalloc(&device_num_kvs, sizeof(uint32_t));\n\tcudaMemset(device_num_kvs, 0, sizeof(uint32_t));\n\n\tKeyValue* device_kvs;\n\tcudaMalloc(&device_kvs, sizeof(KeyValue) * kNumKeyValues);\n\n\tint mingridsize;\n\tint threadblocksize;\n\tcudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_iterate_hashtable, 0, 0);\n\n\tint gridsize = (kHashTableCapacity + threadblocksize - 1) / threadblocksize;\n\tgpu_iterate_hashtable<<<gridsize, threadblocksize>>>(pHashTable, device_kvs, device_num_kvs);\n\n\tuint32_t num_kvs;\n\tcudaMemcpy(&num_kvs, device_num_kvs, sizeof(uint32_t), cudaMemcpyDeviceToHost);\n\n\tstd::vector<KeyValue> kvs;\n\tkvs.resize(num_kvs);\n\n\tcudaMemcpy(kvs.data(), device_kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyDeviceToHost);\n\n\tcudaFree(device_kvs);\n\tcudaFree(device_num_kvs);\n\n\treturn kvs;\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 894,
+      "id": 10
+    },
+    {
+      "text": "// Free the memory of the hashtable\nvoid destroy_hashtable(KeyValue* pHashTable)\n{\n\tcudaFree(pHashTable);\n}",
+      "source": "SimpleGPUHashTable - linearprobing.cu",
+      "length": 107,
+      "id": 11
+    }
+  ]
+}
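Note: the quoted snippets all reference KeyValue, kHashTableCapacity, kNumKeyValues, and kEmpty, which this file never defines; they come from elsewhere in the SimpleGPUHashTable source. For anyone checking the quotes against a compiler, here is a minimal sketch of those declarations. The exact values are assumptions, not taken from this diff; the one hard requirement visible in the quotes is that the capacity be a power of two, since hash() and the probing loops wrap slots with & (kHashTableCapacity - 1).

// Sketch of the declarations the quoted code assumes (illustrative values).
#include <stdint.h>

struct KeyValue
{
	uint32_t key;
	uint32_t value;
};

// Must be a power of two so slot indices can wrap with a bitwise AND
// ("& (kHashTableCapacity - 1)") instead of a modulo.
const uint32_t kHashTableCapacity = 128 * 1024 * 1024;

// Keeping the table at most half full bounds linear-probe chain lengths.
const uint32_t kNumKeyValues = kHashTableCapacity / 2;

// Sentinel for an unused slot; create_hashtable()'s memset of 0xff bytes
// relies on kEmpty being 0xffffffff (see the static_assert in quote 2).
const uint32_t kEmpty = 0xffffffff;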