|
30 | 30 | #include <math.h> |
31 | 31 | #include <alloca.h> |
32 | 32 |
|
| 33 | +/* the number of consecutive memcpy for local put/get before calling into MPI */ |
| 34 | +#define NUM_CONSECUTIVE_MEMCPY 16 |
| 35 | + |
| 36 | +/* number of performed local memcpy between calling into MPI */ |
| 37 | +static _Thread_local int num_local_memcpy = 0; |
33 | 38 |
|
34 | 39 | #define CHECK_UNITID_RANGE(_unitid, _team_data) \ |
35 | 40 | do { \ |
@@ -220,13 +225,15 @@ dart__mpi__get_basic( |
220 | 225 | { |
221 | 226 | if (num_reqs) *num_reqs = 0; |
222 | 227 |
|
223 | | - if (team_data->unitid == team_unit_id.id) { |
| 228 | + if (team_data->unitid == team_unit_id.id && |
| 229 | + num_local_memcpy++ < NUM_CONSECUTIVE_MEMCPY) { |
224 | 230 | // use direct memcpy if we are on the same unit |
225 | 231 | memcpy(dest, seginfo->selfbaseptr + offset, |
226 | 232 | nelem * dart__mpi__datatype_sizeof(dtype)); |
227 | 233 | DART_LOG_DEBUG("dart_get: memcpy nelem:%zu " |
228 | 234 | "source (coll.): offset:%lu -> dest: %p", |
229 | 235 | nelem, offset, dest); |
| 236 | + num_local_memcpy = 0; |
230 | 237 | return DART_OK; |
231 | 238 | } |
232 | 239 |
|
@@ -354,12 +361,14 @@ dart__mpi__put_basic( |
354 | 361 | if (num_reqs) *num_reqs = 0; |
355 | 362 |
|
356 | 363 | /* copy data directly if we are on the same unit */ |
357 | | - if (team_unit_id.id == team_data->unitid) { |
| 364 | + if (team_unit_id.id == team_data->unitid && |
| 365 | + num_local_memcpy++ < NUM_CONSECUTIVE_MEMCPY) { |
358 | 366 | if (flush_required_ptr) *flush_required_ptr = false; |
359 | 367 | memcpy(seginfo->selfbaseptr + offset, src, |
360 | 368 | nelem * dart__mpi__datatype_sizeof(dtype)); |
361 | 369 | DART_LOG_DEBUG("dart_put: memcpy nelem:%zu (from global allocation)" |
362 | 370 | "offset: %"PRIu64"", nelem, offset); |
| 371 | + num_local_memcpy = 0; |
363 | 372 | return DART_OK; |
364 | 373 | } |
365 | 374 |
|
|
0 commit comments