Skip to content

Commit 3b3ebdd

Browse files
authored
Revert "Remove size-based partitioning heuristic (#256)" (#376)
* Revert "Remove size-based partitioning heuristic (#256)" This reverts commit cfe828e. * Clang-tidy fixes
1 parent 567a46e commit 3b3ebdd

11 files changed

Lines changed: 203 additions & 14 deletions

File tree

legate/settings.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,48 @@ class LegateRuntimeSettings(Settings):
8181
""",
8282
)
8383

84+
min_gpu_chunk: EnvOnlySetting[int] = EnvOnlySetting(
85+
"min_gpu_chunk",
86+
"LEGATE_MIN_GPU_CHUNK",
87+
default=1048576, # 1 << 20
88+
test_default=2,
89+
convert=convert_int,
90+
help="""
91+
If using GPUs, any task operating on arrays smaller than this will
92+
not be parallelized across more than one GPU.
93+
94+
This is a read-only environment variable setting used by the runtime.
95+
""",
96+
)
97+
98+
min_cpu_chunk: EnvOnlySetting[int] = EnvOnlySetting(
99+
"min_cpu_chunk",
100+
"LEGATE_MIN_CPU_CHUNK",
101+
default=16384, # 1 << 14
102+
test_default=2,
103+
convert=convert_int,
104+
help="""
105+
If using CPUs, any task operating on arrays smaller than this will
106+
not be parallelized across more than one core.
107+
108+
This is a read-only environment variable setting used by the runtime.
109+
""",
110+
)
111+
112+
min_omp_chunk: EnvOnlySetting[int] = EnvOnlySetting(
113+
"min_omp_chunk",
114+
"LEGATE_MIN_OMP_CHUNK",
115+
default=131072, # 1 << 17
116+
test_default=2,
117+
convert=convert_int,
118+
help="""
119+
If using OpenMP, any task operating on arrays smaller than this will
120+
not be parallelized across more than one OpenMP group.
121+
122+
This is a read-only environment variable setting used by the runtime.
123+
""",
124+
)
125+
84126
window_size: EnvOnlySetting[int] = EnvOnlySetting(
85127
"window_size",
86128
"LEGATE_WINDOW_SIZE",
@@ -94,6 +136,43 @@ class LegateRuntimeSettings(Settings):
94136
""",
95137
)
96138

139+
max_pending_exceptions: EnvOnlySetting[int] = EnvOnlySetting(
140+
"max_pending_exceptions",
141+
"LEGATE_MAX_PENDING_EXCEPTIONS",
142+
default=64,
143+
test_default=1,
144+
convert=convert_int,
145+
help="""
146+
How many possibly-exception-throwing tasks to emit before blocking.
147+
Legate by default does not wait for operations to complete, but instead
148+
"runs ahead" and continues launching work, which will complete
149+
asynchronously. If an operation throws an exception, then by the time
150+
an exception is reported execution may have progressed beyond the
151+
launch of the faulting operation. If you need to check for an exception
152+
at the exact point where it might get thrown (e.g. to catch it and
153+
recover gracefully), set this to 1. Note that this will introduce more
154+
blocking in the control logic of your program, likely reducing overall
155+
utilization.
156+
157+
This is a read-only environment variable setting used by the runtime.
158+
""",
159+
)
160+
161+
precise_exception_trace: EnvOnlySetting[bool] = EnvOnlySetting(
162+
"precise_exception_trace",
163+
"LEGATE_PRECISE_EXCEPTION_TRACE",
164+
default=False,
165+
test_default=False,
166+
convert=convert_bool,
167+
help="""
168+
Whether to capture the stacktrace at the point when a potentially
169+
faulting operation is launched, so a more accurate error location can
170+
be reported in case an exception is thrown.
171+
172+
This is a read-only environment variable setting used by the runtime.
173+
""",
174+
)
175+
97176
field_reuse_frac: EnvOnlySetting[int] = EnvOnlySetting(
98177
"field_reuse_frac",
99178
"LEGATE_FIELD_REUSE_FRAC",
@@ -125,6 +204,23 @@ class LegateRuntimeSettings(Settings):
125204
""",
126205
)
127206

207+
max_lru_length: EnvOnlySetting[int] = EnvOnlySetting(
208+
"max_lru_length",
209+
"LEGATE_MAX_LRU_LENGTH",
210+
default=5,
211+
test_default=1,
212+
convert=convert_int,
213+
help="""
214+
Once the last Store of a given shape is garbage collected, the
215+
resources associated with it are placed on an LRU queue, rather than
216+
getting freed immediately, in case the program creates a Store of the
217+
same shape in the near future. This setting controls the length of that
218+
LRU queue.
219+
220+
This is a read-only environment variable setting used by the runtime.
221+
""",
222+
)
223+
128224
disable_mpi: EnvOnlySetting[bool] = EnvOnlySetting(
129225
"disable_mpi",
130226
"LEGATE_DISABLE_MPI",

src/core/legate_c.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,10 @@ typedef enum legate_core_tunable_t {
6666
LEGATE_CORE_TUNABLE_TOTAL_GPUS,
6767
LEGATE_CORE_TUNABLE_NUM_NODES,
6868
LEGATE_CORE_TUNABLE_HAS_SOCKET_MEM,
69+
LEGATE_CORE_TUNABLE_MIN_SHARD_VOLUME,
70+
LEGATE_CORE_TUNABLE_WINDOW_SIZE,
6971
LEGATE_CORE_TUNABLE_FIELD_REUSE_SIZE,
72+
LEGATE_CORE_TUNABLE_MAX_LRU_LENGTH,
7073
} legate_core_tunable_t;
7174

7275
typedef enum legate_core_variant_t {

src/core/mapping/detail/core_mapper.cc

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,19 @@ class CoreMapper final : public Mapper {
5858

5959
private:
6060
const LocalMachine machine{};
61+
// TODO(wonchanl): Some of these should be moved to legate::detail::Config
62+
const int64_t min_gpu_chunk{
63+
extract_env("LEGATE_MIN_GPU_CHUNK", MIN_GPU_CHUNK_DEFAULT, MIN_GPU_CHUNK_TEST)};
64+
const int64_t min_cpu_chunk{
65+
extract_env("LEGATE_MIN_CPU_CHUNK", MIN_CPU_CHUNK_DEFAULT, MIN_CPU_CHUNK_TEST)};
66+
const int64_t min_omp_chunk{
67+
extract_env("LEGATE_MIN_OMP_CHUNK", MIN_OMP_CHUNK_DEFAULT, MIN_OMP_CHUNK_TEST)};
68+
const uint32_t window_size{
69+
extract_env("LEGATE_WINDOW_SIZE", WINDOW_SIZE_DEFAULT, WINDOW_SIZE_TEST)};
6170
const uint32_t field_reuse_frac{
6271
extract_env("LEGATE_FIELD_REUSE_FRAC", FIELD_REUSE_FRAC_DEFAULT, FIELD_REUSE_FRAC_TEST)};
72+
const uint32_t max_lru_length{
73+
extract_env("LEGATE_MAX_LRU_LENGTH", MAX_LRU_LENGTH_DEFAULT, MAX_LRU_LENGTH_TEST)};
6374
};
6475

6576
void CoreMapper::set_machine(const legate::mapping::MachineQueryInterface* /*m*/) {}
@@ -91,9 +102,25 @@ Scalar CoreMapper::tunable_value(TunableID tunable_id)
91102
case LEGATE_CORE_TUNABLE_NUM_NODES: {
92103
return Scalar{static_cast<int32_t>(machine.total_nodes)};
93104
}
105+
case LEGATE_CORE_TUNABLE_MIN_SHARD_VOLUME: {
106+
// TODO(wonchanl): make these profile guided
107+
if (machine.has_gpus()) {
108+
// Make sure we can get at least 1M elements on each GPU
109+
return Scalar{min_gpu_chunk};
110+
}
111+
if (machine.has_omps()) {
112+
// Make sure we can get at least 128K elements on each OpenMP group
113+
return Scalar{min_omp_chunk};
114+
}
115+
// Make sure we can get at least 16K elements on each CPU
116+
return Scalar{min_cpu_chunk};
117+
}
94118
case LEGATE_CORE_TUNABLE_HAS_SOCKET_MEM: {
95119
return Scalar{machine.has_socket_memory()};
96120
}
121+
case LEGATE_CORE_TUNABLE_WINDOW_SIZE: {
122+
return Scalar{window_size};
123+
}
97124
case LEGATE_CORE_TUNABLE_FIELD_REUSE_SIZE: {
98125
// Multiply this by the total number of nodes and then scale by the frac
99126
const uint64_t global_mem_size =
@@ -102,6 +129,9 @@ Scalar CoreMapper::tunable_value(TunableID tunable_id)
102129
: machine.system_memory().capacity());
103130
return Scalar{global_mem_size / field_reuse_frac};
104131
}
132+
case LEGATE_CORE_TUNABLE_MAX_LRU_LENGTH: {
133+
return Scalar{max_lru_length};
134+
}
105135
default: break;
106136
}
107137
// Illegal tunable variable

src/core/runtime/detail/partition_manager.cc

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,18 @@
2323

2424
namespace legate::detail {
2525

26+
PartitionManager::PartitionManager(Runtime* runtime)
27+
{
28+
auto mapper_id = runtime->core_library()->get_mapper_id();
29+
30+
min_shard_volume_ =
31+
runtime->get_tunable<int64_t>(mapper_id, LEGATE_CORE_TUNABLE_MIN_SHARD_VOLUME);
32+
33+
if (LegateDefined(LEGATE_USE_DEBUG)) {
34+
assert(min_shard_volume_ > 0);
35+
}
36+
}
37+
2638
const std::vector<uint32_t>& PartitionManager::get_factors(const mapping::detail::Machine& machine)
2739
{
2840
auto curr_num_pieces = machine.count();
@@ -64,7 +76,7 @@ tuple<uint64_t> PartitionManager::compute_launch_shape(const mapping::detail::Ma
6476
// Prune out any dimensions that are 1
6577
std::vector<size_t> temp_shape{};
6678
std::vector<uint32_t> temp_dims{};
67-
size_t volume = 1;
79+
int64_t volume = 1;
6880

6981
temp_dims.reserve(shape.size());
7082
temp_shape.reserve(shape.size());
@@ -76,23 +88,28 @@ tuple<uint64_t> PartitionManager::compute_launch_shape(const mapping::detail::Ma
7688
}
7789
temp_shape.push_back(extent);
7890
temp_dims.push_back(dim);
79-
volume *= extent;
91+
volume *= static_cast<int64_t>(extent);
8092
}
8193

82-
if (temp_shape.empty()) {
94+
// Figure out how many shards we can make with this array
95+
int64_t max_pieces = (volume + min_shard_volume_ - 1) / min_shard_volume_;
96+
assert(volume == 0 || max_pieces > 0);
97+
// If we can only make one piece return that now
98+
if (max_pieces <= 1) {
8399
return {};
84100
}
85101

86102
// TODO(wonchanl): We need a better heuristic
87-
auto max_pieces = curr_num_pieces;
103+
max_pieces = curr_num_pieces;
88104

89105
// First compute the N-th root of the number of pieces
90106
const auto ndim = temp_shape.size();
107+
assert(ndim > 0);
91108
std::vector<size_t> temp_result{};
92109

93110
if (1 == ndim) {
94111
// Easy one dimensional case
95-
temp_result.push_back(std::min<size_t>(temp_shape.front(), max_pieces));
112+
temp_result.push_back(std::min<size_t>(temp_shape.front(), static_cast<size_t>(max_pieces)));
96113
} else if (2 == ndim) {
97114
if (volume < max_pieces) {
98115
// TBD: Once the max_pieces heuristic is fixed, this should never happen
@@ -112,11 +129,11 @@ tuple<uint64_t> PartitionManager::compute_launch_shape(const mapping::detail::Ma
112129
// try rounding n both up and down
113130
constexpr auto EPSILON = 1e-12;
114131

115-
auto n1 = std::max<size_t>(1, static_cast<size_t>(std::floor(n + EPSILON)));
132+
auto n1 = std::max<int64_t>(1, static_cast<int64_t>(std::floor(n + EPSILON)));
116133
while (max_pieces % n1 != 0) {
117134
--n1;
118135
}
119-
auto n2 = std::max<size_t>(1, static_cast<size_t>(std::floor(n - EPSILON)));
136+
auto n2 = std::max<int64_t>(1, static_cast<int64_t>(std::floor(n - EPSILON)));
120137
while (max_pieces % n2 != 0) {
121138
++n2;
122139
}

src/core/runtime/detail/partition_manager.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,12 @@ struct Machine;
3030

3131
namespace legate::detail {
3232

33+
class Runtime;
34+
3335
class PartitionManager {
3436
public:
37+
explicit PartitionManager(Runtime* runtime);
38+
3539
[[nodiscard]] const std::vector<uint32_t>& get_factors(const mapping::detail::Machine& machine);
3640

3741
[[nodiscard]] tuple<uint64_t> compute_launch_shape(const mapping::detail::Machine& machine,
@@ -67,6 +71,7 @@ class PartitionManager {
6771
Legion::FieldID field_id);
6872

6973
private:
74+
int64_t min_shard_volume_{};
7075
std::unordered_map<uint32_t, std::vector<uint32_t>> all_factors_{};
7176

7277
using TilingCacheKey = std::pair<Legion::IndexSpace, Tiling>;

src/core/runtime/detail/runtime.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ constexpr const char* const TOPLEVEL_NAME = "Legate Core Toplevel Task";
8989

9090
Runtime::Runtime()
9191
: legion_runtime_{Legion::Runtime::get_runtime()},
92-
window_size_{extract_env("LEGATE_WINDOW_SIZE", WINDOW_SIZE_DEFAULT, WINDOW_SIZE_TEST)},
9392
field_reuse_freq_{
9493
extract_env("LEGATE_FIELD_REUSE_FREQ", FIELD_REUSE_FREQ_DEFAULT, FIELD_REUSE_FREQ_TEST)},
9594
force_consensus_match_{!!extract_env("LEGATE_CONSENSUS", CONSENSUS_DEFAULT, CONSENSUS_TEST)}
@@ -198,7 +197,7 @@ void Runtime::initialize(Legion::Context legion_context)
198197
core_library_ = find_library(CORE_LIBRARY_NAME, false /*can_fail*/);
199198
// TODO(jfaibussowit): Use smart pointers for these
200199
communicator_manager_ = new CommunicatorManager{};
201-
partition_manager_ = new PartitionManager{};
200+
partition_manager_ = new PartitionManager{this};
202201
machine_manager_ = new MachineManager{};
203202
provenance_manager_ = new ProvenanceManager{};
204203
Config::has_socket_mem =

src/core/runtime/detail/runtime.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ class Runtime {
359359
registered_shardings_{};
360360

361361
std::vector<InternalSharedPtr<Operation>> operations_;
362-
size_t window_size_{};
362+
size_t window_size_{1};
363363
uint64_t next_unique_id_{};
364364

365365
using RegionFieldID = std::pair<Legion::LogicalRegion, Legion::FieldID>;

src/core/task/detail/return.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ struct ReturnedException {
5050
ReturnedException() = default;
5151
ReturnedException(int32_t index, std::string_view error_message);
5252

53-
static inline constexpr auto MAX_MESSAGE_SIZE = 2048;
53+
static inline constexpr auto MAX_MESSAGE_SIZE = 256;
5454

5555
[[nodiscard]] bool raised() const;
5656

src/env_defaults.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,43 @@
1717

1818
#pragma once
1919

20+
#include "legate_defines.h"
21+
22+
// 1 << 20 (need actual number for python to parse)
23+
#define MIN_GPU_CHUNK_DEFAULT 1048576
24+
#define MIN_GPU_CHUNK_TEST 2
25+
26+
// 1 << 14 (need actual number for python to parse)
27+
#define MIN_CPU_CHUNK_DEFAULT 16384
28+
#define MIN_CPU_CHUNK_TEST 2
29+
30+
// 1 << 17 (need actual number for python to parse)
31+
#define MIN_OMP_CHUNK_DEFAULT 131072
32+
#define MIN_OMP_CHUNK_TEST 2
33+
2034
#define WINDOW_SIZE_DEFAULT 1
2135
#define WINDOW_SIZE_TEST 1
2236

37+
#if LegateDefined(LEGATE_USE_DEBUG)
38+
// In debug mode, the default is always block on tasks that can throw exceptions
39+
#define MAX_PENDING_EXCEPTIONS_DEFAULT 1
40+
#else
41+
#define MAX_PENDING_EXCEPTIONS_DEFAULT 64
42+
#endif
43+
#define MAX_PENDING_EXCEPTIONS_TEST 1
44+
45+
#define PRECISE_EXCEPTION_TRACE_DEFAULT 0
46+
#define PRECISE_EXCEPTION_TRACE_TEST 0
47+
2348
#define FIELD_REUSE_FRAC_DEFAULT 256
2449
#define FIELD_REUSE_FRAC_TEST 1
2550

2651
#define FIELD_REUSE_FREQ_DEFAULT 32
2752
#define FIELD_REUSE_FREQ_TEST 8
2853

54+
#define MAX_LRU_LENGTH_DEFAULT 5
55+
#define MAX_LRU_LENGTH_TEST 1
56+
2957
#define DISABLE_MPI_DEFAULT 0
3058
#define DISABLE_MPI_TEST 0
3159

tests/unit/legate/core/util/task_util.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,9 +200,7 @@ def deep_clone() -> TestFunction[_P, _T]:
200200
fn_copy.self = fn_copy
201201

202202
def mark_called() -> None:
203-
# FIXME: This assertion doesn't hold if more than one Python
204-
# task gets launched.
205-
# assert not fn_copy.self.called, f"{fn}, {fn_copy.self}"
203+
assert not fn_copy.self.called, f"{fn}, {fn_copy.self}"
206204
fn_copy.self.called = True
207205

208206
fn_copy.self.mark_called = mark_called

0 commit comments

Comments
 (0)