Skip to content

Commit a824bc1

Browse files
v1.64.00 (#198)
* Added BLOCKSIZES to a2asweep preset to allow sweeping over threadblock sizes * Fixing src initialization when using BYTE_OFFSET * Adding FILL_COMPRESS functionality to allow for different input data patterns * Updating CHANGELOG regarding GFX_BLOCKSIZE limit increase to 1024
1 parent fb012cf commit a824bc1

6 files changed

Lines changed: 234 additions & 69 deletions

File tree

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,27 @@
33
Documentation for TransferBench is available at
44
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
55

6+
## v1.64.00
7+
### Added
8+
- Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes
9+
- Added FILL_COMPRESS to allow more control over input data pattern
10+
- FILL_COMPRESS takes in a comma-separated list of integer percentages (that must add up to 100)
11+
that sets the percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0 data patterns
12+
- Bins:
13+
- 0 - random
14+
- 1 - 1B0 upper 1 byte of each aligned 2 bytes is 0
15+
- 2 - 2B0 upper 2 bytes of each aligned 4 bytes are 0
16+
- 3 - 4B0 upper 4 bytes of each aligned 8 bytes are 0
17+
- 4 - 32B0 upper 32 bytes of each aligned 64-byte line are 0
18+
- FILL_PATTERN will be ignored if FILL_COMPRESS is specified
19+
- Additional details about data patterns generated will be printed if the debug env var DUMP_LINES is
20+
set to a non-zero value, which also corresponds to how many 64 byte lines will be printed
21+
### Modified
22+
- Increased GFX_BLOCK_SIZE limit from 512 to 1024 (still requires multiple of 64)
23+
24+
### Fixed
25+
- Fixed source data initialization bug when using BYTE_OFFSET
26+
627
## v1.63.00
728
### Added
829
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE)
99
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
1010
endif()
1111

12-
set(VERSION_STRING "1.63.00")
12+
set(VERSION_STRING "1.64.00")
1313
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
1414

1515
## Load CMake modules

Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,3 @@ TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
8383

8484
clean:
8585
rm -f ./TransferBench ./TransferBenchCuda
86-

src/client/EnvVars.hpp

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ class EnvVars
7777
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
7878
int byteOffset; // Byte-offset for memory allocations
7979
vector<float> fillPattern; // Pattern of floats used to fill source data
80+
vector<int> fillCompress; // Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0
8081
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
8182
int validateSource; // Validate source GPU memory immediately after preparation
8283

@@ -137,6 +138,7 @@ class EnvVars
137138
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
138139
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
139140
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
141+
fillCompress = GetEnvVarArray("FILL_COMPRESS" , {});
140142
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
141143
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
142144
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
@@ -314,6 +316,7 @@ class EnvVars
314316
printf(" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)\n");
315317
#endif
316318
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
319+
printf(" FILL_COMPRESS - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0\n");
317320
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
318321
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
319322
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
@@ -400,6 +403,8 @@ class EnvVars
400403
#endif
401404
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
402405
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
406+
Print("FILL_COMPRESS", getenv("FILL_COMPRESS") ? 1 : 0,
407+
"%s", (fillCompress.size() ? GetStr(fillCompress).c_str() : "Not specified"));
403408
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
404409
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
405410
Print("GFX_BLOCK_ORDER", gfxBlockOrder,
@@ -493,6 +498,27 @@ class EnvVars
493498
}
494499

495500
// Parses a comma-separated list of integers from the environment variable @varname.
//
// Returns @defaultValue if the variable is not set.  If the variable is set, returns
// the parsed integers in order (an empty vector if the variable is set but holds no tokens).
// Empty tokens (e.g. "1,,2" or a trailing comma) are skipped.
// Each token is parsed with "%d", so leading whitespace is accepted and any characters
// following the leading integer are ignored.
// If a token does not start with an integer, an error is printed and the process exits with code 1.
static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
{
  char const* rawValue = getenv(varname.c_str());
  if (!rawValue) return defaultValue;

  // Tokenize a private copy: the buffer returned by getenv belongs to the process
  // environment, and splitting it in place (as strtok does) would truncate the variable
  // for every later getenv call
  std::string const listStr(rawValue);

  std::vector<int> values;
  std::string::size_type tokenStart = 0;
  while (tokenStart < listStr.size()) {
    std::string::size_type tokenEnd = listStr.find(',', tokenStart);
    if (tokenEnd == std::string::npos) tokenEnd = listStr.size();

    // Skip empty tokens produced by leading / trailing / consecutive commas
    if (tokenEnd > tokenStart) {
      std::string const token = listStr.substr(tokenStart, tokenEnd - tokenStart);
      int val;
      if (sscanf(token.c_str(), "%d", &val) != 1) {
        printf("[ERROR] Unrecognized token [%s]\n", token.c_str());
        exit(1);
      }
      values.push_back(val);
    }
    tokenStart = tokenEnd + 1;
  }
  return values;
}
520+
521+
static std::vector<int> GetEnvVarRangeArray(std::string const& varname, std::vector<int> const& defaultValue)
496522
{
497523
if (getenv(varname.c_str())) {
498524
char* rangeStr = getenv(varname.c_str());
@@ -524,6 +550,15 @@ class EnvVars
524550
return defaultValue;
525551
}
526552

553+
// Returns the integers in @varnameList joined into a single comma-separated string
// (e.g. {1,2,3} -> "1,2,3").  An empty list produces an empty string.
std::string GetStr(std::vector<int> const& varnameList) const {
  std::string result;
  bool isFirst = true;
  // Range-for avoids comparing a signed index against the unsigned container size
  for (int const value : varnameList) {
    if (!isFirst) result += ",";
    result += std::to_string(value);
    isFirst = false;
  }
  return result;
}
561+
527562
std::string GetCuMaskDesc() const
528563
{
529564
std::vector<std::pair<int, int>> runs;
@@ -572,9 +607,10 @@ class EnvVars
572607
cfg.data.alwaysValidate = alwaysValidate;
573608
cfg.data.blockBytes = blockBytes;
574609
cfg.data.byteOffset = byteOffset;
610+
cfg.data.fillCompress = fillCompress;
611+
cfg.data.fillPattern = fillPattern;
575612
cfg.data.validateDirect = validateDirect;
576613
cfg.data.validateSource = validateSource;
577-
cfg.data.fillPattern = fillPattern;
578614

579615
cfg.dma.useHipEvents = useHipEvents;
580616
cfg.dma.useHsaCopy = useHsaDma;

src/client/Presets/AllToAllSweep.hpp

Lines changed: 45 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ void AllToAllSweepPreset(EnvVars& ev,
5050
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
5151
int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
5252

53+
std::vector<int> blockList = EnvVars::GetEnvVarArray("BLOCKSIZES", {256});
5354
std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
5455
std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});
5556

@@ -77,6 +78,7 @@ void AllToAllSweepPreset(EnvVars& ev,
7778
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
7879
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
7980
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
81+
ev.Print("BLOCKSIZES" , blockList.size() , EnvVars::ToStr(blockList).c_str());
8082
ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
8183
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
8284
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
@@ -180,48 +182,53 @@ void AllToAllSweepPreset(EnvVars& ev,
180182
std::map<std::pair<int, int>, TransferBench::TestResults> results;
181183

182184
// Display summary
183-
printf("#CUs\\Unroll");
184-
for (int u : unrollList) {
185-
printf(" %d(Min) ", u);
186-
if (!showMinOnly) printf(" %d(Max) ", u);
187-
}
188-
printf("\n");
189-
for (int c : numCusList) {
190-
printf(" %5d ", c); fflush(stdout);
185+
for (int blockSize : blockList) {
186+
printf("Blocksize: %d\n", blockSize);
187+
ev.gfxBlockSize = cfg.gfx.blockSize = blockSize;
188+
189+
printf("#CUs\\Unroll");
191190
for (int u : unrollList) {
192-
ev.gfxUnroll = cfg.gfx.unrollFactor = u;
193-
for (auto& transfer : transfers)
194-
transfer.numSubExecs = useSpray ? (c * targetCount) : c;
195-
196-
double minBandwidth = std::numeric_limits<double>::max();
197-
double maxBandwidth = std::numeric_limits<double>::min();
198-
TransferBench::TestResults result;
199-
if (TransferBench::RunTransfers(cfg, transfers, result)) {
200-
for (auto const& exeResult : result.exeResults) {
201-
minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
202-
maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
203-
}
204-
if (useSpray) {
205-
minBandwidth *= targetCount;
206-
maxBandwidth *= targetCount;
207-
}
208-
results[std::make_pair(c,u)] = result;
209-
} else {
210-
minBandwidth = 0.0;
211-
}
212-
printf(" %7.2f ", minBandwidth);
213-
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
214-
fflush(stdout);
191+
printf(" %d(Min) ", u);
192+
if (!showMinOnly) printf(" %d(Max) ", u);
215193
}
216-
printf("\n"); fflush(stdout);
217-
}
218-
219-
if (verbose) {
220-
int testNum = 0;
194+
printf("\n");
221195
for (int c : numCusList) {
196+
printf(" %5d ", c); fflush(stdout);
222197
for (int u : unrollList) {
223-
printf("CUs: %d Unroll %d\n", c, u);
224-
PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]);
198+
ev.gfxUnroll = cfg.gfx.unrollFactor = u;
199+
for (auto& transfer : transfers)
200+
transfer.numSubExecs = useSpray ? (c * targetCount) : c;
201+
202+
double minBandwidth = std::numeric_limits<double>::max();
203+
double maxBandwidth = std::numeric_limits<double>::min();
204+
TransferBench::TestResults result;
205+
if (TransferBench::RunTransfers(cfg, transfers, result)) {
206+
for (auto const& exeResult : result.exeResults) {
207+
minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
208+
maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
209+
}
210+
if (useSpray) {
211+
minBandwidth *= targetCount;
212+
maxBandwidth *= targetCount;
213+
}
214+
results[std::make_pair(c,u)] = result;
215+
} else {
216+
minBandwidth = 0.0;
217+
}
218+
printf(" %7.2f ", minBandwidth);
219+
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
220+
fflush(stdout);
221+
}
222+
printf("\n"); fflush(stdout);
223+
}
224+
225+
if (verbose) {
226+
int testNum = 0;
227+
for (int c : numCusList) {
228+
for (int u : unrollList) {
229+
printf("CUs: %d Unroll %d\n", c, u);
230+
PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]);
231+
}
225232
}
226233
}
227234
}

0 commit comments

Comments
 (0)