@@ -31,7 +31,7 @@ int BmaSweepPreset(EnvVars& ev,
3131 }
3232
3333#ifndef BMA_EXEC_ENABLED
34- Utils::Print (" [ERROR] BMA executor requires ROCm 7.0 or newer\n " );
34+ Utils::Print (" [ERROR] BMA executor requires ROCm 7.1 or newer\n " );
3535 return 1 ;
3636#endif
3737
@@ -40,9 +40,11 @@ int BmaSweepPreset(EnvVars& ev,
4040 // Collect env vars for this preset
4141 int exeIndex = EnvVars::GetEnvVar (" EXE_INDEX" , 0 );
4242 int localCopy = EnvVars::GetEnvVar (" LOCAL_COPY" , 0 );
43+ vector<int > gfxSesList = EnvVars::GetEnvVarArray (" GFX_SUB_EXECS" , {});
4344 int gpuMemTypeIdx = EnvVars::GetEnvVar (" GPU_MEM_TYPE" , 0 );
4445 int numGpuDevices = EnvVars::GetEnvVar (" NUM_GPU_DEVICES" , numDetectedGpus);
45- vector<int > numSesList = EnvVars::GetEnvVarArray (" NUM_SUB_EXECS" , {1 ,2 ,4 ,8 });
46+ vector<int > bmaSesList = EnvVars::GetEnvVarArray (" NUM_SUB_EXECS" , {1 ,2 ,4 ,8 });
47+
4648
4749 MemType gpuMemType = Utils::GetGpuMemType (gpuMemTypeIdx);
4850
@@ -54,9 +56,10 @@ int BmaSweepPreset(EnvVars& ev,
5456 if (!outputToCsv) printf (" [BMA Sweep Related]\n " );
5557 ev.Print (" EXE_INDEX" , exeIndex, " Executing on GPU %d" , exeIndex);
5658 ev.Print (" LOCAL_COPY" , localCopy, " %s local copy to GPU %d" , localCopy ? " Including" : " Excluding" , exeIndex);
59+ ev.Print (" GFX_SUB_EXECS" , gfxSesList.size (), EnvVars::ToStr (gfxSesList).c_str ());
5760 ev.Print (" GPU_MEM_TYPE" , gpuMemTypeIdx, " Using %s (%s)" , Utils::GetGpuMemTypeStr (gpuMemTypeIdx).c_str (), Utils::GetAllGpuMemTypeStr ().c_str ());
5861 ev.Print (" NUM_GPU_DEVICES" , numGpuDevices, " Using %d GPUs" , numGpuDevices);
59- ev.Print (" NUM_SUB_EXECS" , numSesList .size (), EnvVars::ToStr (numSesList ).c_str ());
62+ ev.Print (" NUM_SUB_EXECS" , bmaSesList .size (), EnvVars::ToStr (bmaSesList ).c_str ());
6063 printf (" \n " );
6164 }
6265 }
@@ -66,33 +69,40 @@ int BmaSweepPreset(EnvVars& ev,
6669 return 1 ;
6770 }
6871
69- int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0 );
72+ int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0 );
73+ int numBmaSubExec = (int )bmaSesList.size ();
74+ int numGfxSubExec = (int )gfxSesList.size ();
7075
7176 TransferBench::ConfigOptions cfg = ev.ToConfigOptions ();
7277 TransferBench::TestResults results;
7378
7479 // Prepare table of results
7580 int minPow2Exp = 12 ;
7681 int maxPow2Exp = 30 ;
77- int numRows = ( maxPow2Exp - minPow2Exp + 1 ) + 1 ;
78- int numCols = 2 + numSesList. size () ;
82+ int numRows = 1 + (bytesSpecified ? 1 : ( maxPow2Exp - minPow2Exp + 1 )) ;
83+ int numCols = 2 + numBmaSubExec + numGfxSubExec ;
7984
8085 Utils::TableHelper table (numRows, numCols);
81-
82- Utils::Print (" Performing %d simultaneous DMA Transfers from GPU %0 to other GPUs\n " , numTransfers, exeIndex);
86+ Utils::Print (" Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n " , numTransfers, exeIndex);
8387
8488 // Prepare headers
89+
8590 table.Set (0 , 0 , " Bytes " );
8691 table.Set (0 , 1 , " DMA " );
87- for (int i = 0 ; i < numSesList.size (); i++) {
88- table.Set (0 , 2 +i, " BMA (%d) " , numSesList[i]);
92+ for (int i = 0 ; i < numBmaSubExec; i++) {
93+ table.Set (0 , 2 +i, " BMA(%02d) " , bmaSesList[i]);
94+ }
95+ for (int i = 0 ; i < numGfxSubExec; i++) {
96+ table.Set (0 , 2 +numBmaSubExec+i, " GFX(%02d) " , gfxSesList[i]);
8997 }
98+
9099 table.DrawRowBorder (0 );
91100 table.DrawRowBorder (1 );
92101 table.DrawRowBorder (numRows);
93102 table.DrawColBorder (0 );
94103 table.DrawColBorder (1 );
95104 table.DrawColBorder (2 );
105+ table.DrawColBorder (2 +numBmaSubExec);
96106 table.DrawColBorder (numCols);
97107
98108 if (!ev.outputToCsv ){
@@ -101,6 +111,8 @@ int BmaSweepPreset(EnvVars& ev,
101111 };
102112
103113 for (size_t numBytes = 1ULL <<minPow2Exp, currRow=1 ; numBytes <= (1ULL <<maxPow2Exp); numBytes<<=1 , currRow++) {
114+ if (bytesSpecified) numBytes = numBytesPerTransfer;
115+
104116 if (!ev.outputToCsv ) {
105117 Utils::Print (" ." );
106118 fflush (stdout);
@@ -132,8 +144,8 @@ int BmaSweepPreset(EnvVars& ev,
132144
133145 // BMA executor next
134146 t.exeDevice = {EXE_GPU_BDMA, exeIndex};
135- for (int i = 0 ; i < numSesList. size () ; i++) {
136- t.numSubExecs = numSesList [i];
147+ for (int i = 0 ; i < numBmaSubExec ; i++) {
148+ t.numSubExecs = bmaSesList [i];
137149
138150 if (!TransferBench::RunTransfers (cfg, transfers, results)) {
139151 for (auto const & err : results.errResults )
@@ -143,6 +155,21 @@ int BmaSweepPreset(EnvVars& ev,
143155
144156 table.Set (currRow, 2 +i, " %6.2f " , numTransfers * results.tfrResults [0 ].avgBandwidthGbPerSec );
145157 }
158+
159+ // GFX executor last
160+ t.exeDevice = {EXE_GPU_GFX, exeIndex};
161+ for (int i = 0 ; i < numGfxSubExec; i++) {
162+ t.numSubExecs = gfxSesList[i];
163+
164+ if (!TransferBench::RunTransfers (cfg, transfers, results)) {
165+ for (auto const & err : results.errResults )
166+ Utils::Print (" %s\n " , err.errMsg .c_str ());
167+ return 1 ;
168+ }
169+
170+ table.Set (currRow, 2 +numBmaSubExec+i, " %6.2f " , results.tfrResults [0 ].avgBandwidthGbPerSec );
171+ }
172+ if (bytesSpecified) break ;
146173 }
147174
148175 if (!ev.outputToCsv ) {
0 commit comments