Skip to content

Commit 3e2fdcc

Browse files
committed
fixes
fix: some data races and more
1 parent 9c6b302 commit 3e2fdcc

9 files changed

Lines changed: 282 additions & 34 deletions

File tree

CMakeLists.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,19 @@ project(dsr
66

77
include(GNUInstallDirs)
88

9+
# -DTSAN=ON enables ThreadSanitizer across the whole build (library + tests/benchmarks).
10+
# Must be applied before any add_subdirectory so every TU is instrumented.
11+
# Incompatible with SANITIZER (ASan/UBSan) — enforce that here.
12+
option(TSAN "Enable ThreadSanitizer" OFF)
13+
if (TSAN)
14+
if (SANITIZER)
15+
message(FATAL_ERROR "TSAN and SANITIZER (ASan+UBSan) are mutually exclusive")
16+
endif()
17+
message(STATUS "ThreadSanitizer enabled")
18+
add_compile_options(-fsanitize=thread -fno-omit-frame-pointer)
19+
add_link_options(-fsanitize=thread)
20+
endif()
21+
922
add_definitions(-I/usr/include/x86_64-linux-gnu/qt6/QtOpenGLWidgets/)
1023

1124
include_directories(/home/robocomp/robocomp/classes)

api/dsr_api.cpp

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,11 +1093,20 @@ void DSRGraph::join_delta_node(IDL::MvregNode &&mvreg)
10931093
};
10941094

10951095
std::optional<std::unordered_set<std::pair<uint64_t, std::string>,hash_tuple>> cache_map_to_edges = {};
1096+
// Snapshot the data needed for signal emission while the lock is held.
1097+
// nodes.at(id) must NOT be accessed after the lock is released: a concurrent
1098+
// insert_node_/update_node call on the same id runs nodes[id].write() which
1099+
// calls dk.rmv() (clears dk.ds) followed by dk.add(), leaving a window where
1100+
// read_reg()'s assert(dk.ds.size() >= 1) would fire.
1101+
std::string node_type_snapshot;
1102+
std::vector<std::pair<uint64_t, std::string>> from_edges_snapshot;
10961103
{
10971104
std::unique_lock<std::shared_mutex> lock(_mutex);
10981105
if (!deleted.contains(id)) {
10991106
joined = true;
1100-
maybe_deleted_node = (nodes[id].empty()) ? std::nullopt : std::make_optional(nodes.at(id).read_reg());
1107+
if (auto it = nodes.find(id); it != nodes.end() && !it->second.empty()) {
1108+
maybe_deleted_node = it->second.read_reg();
1109+
}
11011110
nodes[id].join(std::move(crdt_delta));
11021111
if (nodes.at(id).empty() or d_empty) {
11031112
nodes.erase(id);
@@ -1106,8 +1115,14 @@ void DSRGraph::join_delta_node(IDL::MvregNode &&mvreg)
11061115
delete_unprocessed_deltas();
11071116
} else {
11081117
signal = true;
1109-
update_maps_node_insert(id, nodes.at(id).read_reg());
1118+
const auto& reg = nodes.at(id).read_reg();
1119+
update_maps_node_insert(id, reg);
11101120
consume_unprocessed_deltas();
1121+
// Snapshot type and outgoing edges before the lock is released.
1122+
node_type_snapshot = reg.type();
1123+
for (const auto &[k, v] : reg.fano()) {
1124+
from_edges_snapshot.emplace_back(k.first, k.second);
1125+
}
11111126
}
11121127
} else {
11131128
delete_unprocessed_deltas();
@@ -1116,11 +1131,11 @@ void DSRGraph::join_delta_node(IDL::MvregNode &&mvreg)
11161131

11171132
if (joined) {
11181133
if (signal) {
1119-
DSR_LOG_DEBUG("[JOIN_NODE] node inserted/updated:", id, nodes.at(id).read_reg().type());
1120-
emitter.update_node_signal(id, nodes.at(id).read_reg().type(), SignalInfo{ mvreg.agent_id() });
1121-
for (const auto &[k, v] : nodes.at(id).read_reg().fano()) {
1122-
DSR_LOG_DEBUG("[JOIN_NODE] add edge FROM:", id, k.first, k.second);
1123-
emitter.update_edge_signal(id, k.first, k.second, SignalInfo{ mvreg.agent_id() });
1134+
DSR_LOG_DEBUG("[JOIN_NODE] node inserted/updated:", id, node_type_snapshot);
1135+
emitter.update_node_signal(id, node_type_snapshot, SignalInfo{ mvreg.agent_id() });
1136+
for (const auto &[to_id, edge_type] : from_edges_snapshot) {
1137+
DSR_LOG_DEBUG("[JOIN_NODE] add edge FROM:", id, to_id, edge_type);
1138+
emitter.update_edge_signal(id, to_id, edge_type, SignalInfo{ mvreg.agent_id() });
11241139
}
11251140

11261141
for (const auto &[k, v]: map_new_to_edges)
@@ -1443,7 +1458,10 @@ std::optional<std::string> DSRGraph::join_delta_edge_attr(IDL::MvregEdgeAttr &&m
14431458

14441459
void DSRGraph::join_full_graph(IDL::OrMap &&full_graph)
14451460
{
1446-
std::vector<std::tuple<bool, uint64_t, std::string, std::optional<CRDTNode>>> updates;
1461+
// 5th element: post-join node snapshot captured inside the lock, used for
1462+
// signal emission after the lock is released to avoid racing with
1463+
// insert_node_/update_node (same pattern as join_delta_node).
1464+
std::vector<std::tuple<bool, uint64_t, std::string, std::optional<CRDTNode>, std::optional<CRDTNode>>> updates;
14471465

14481466
uint64_t id{0}, timestamp{0};
14491467
uint32_t agent_id_ch{0};
@@ -1541,24 +1559,26 @@ void DSRGraph::join_full_graph(IDL::OrMap &&full_graph)
15411559
it->second.join(std::move(mv));
15421560
if (mv_empty or it->second.empty()) {
15431561
update_maps_node_delete(k, nd);
1544-
updates.emplace_back(false, k, "", std::nullopt);
1562+
updates.emplace_back(false, k, "", std::nullopt, std::nullopt);
15451563
delete_unprocessed_deltas();
15461564
} else {
1547-
update_maps_node_insert(k, it->second.read_reg());
1548-
updates.emplace_back(true, k, it->second.read_reg().type(), nd);
1565+
const auto& reg = it->second.read_reg();
1566+
update_maps_node_insert(k, reg);
1567+
updates.emplace_back(true, k, reg.type(), nd, reg);
15491568
consume_unprocessed_deltas();
15501569
}
15511570
}
15521571
}
15531572

15541573
}
1555-
for (auto &[signal, id, type, nd] : updates)
1574+
for (auto &[signal, id, type, nd, current_nd] : updates)
15561575
if (signal) {
1557-
//check what change is joined
1558-
if (!nd.has_value() || nd->attrs() != nodes[id].read_reg().attrs()) {
1559-
emitter.update_node_signal(id, nodes[id].read_reg().type(), SignalInfo{ agent_id_ch });
1560-
} else if (nd.value() != nodes[id].read_reg()) {
1561-
auto iter = nodes[id].read_reg().fano();
1576+
//check what change is joined — use the snapshot captured inside the lock,
1577+
//not nodes[id], which races with concurrent insert_node_/update_node calls.
1578+
if (!nd.has_value() || nd->attrs() != current_nd->attrs()) {
1579+
emitter.update_node_signal(id, type, SignalInfo{ agent_id_ch });
1580+
} else if (nd.value() != *current_nd) {
1581+
const auto& iter = current_nd->fano();
15621582
for (const auto &[k, v] : nd->fano()) {
15631583
if (!iter.contains(k)) {
15641584
emitter.del_edge_signal(id, k.first, k.second, SignalInfo{ agent_id_ch });
@@ -1908,8 +1928,8 @@ void DSRGraph::fullgraph_server_thread()
19081928

19091929
std::pair<bool, bool> DSRGraph::fullgraph_request_thread()
19101930
{
1911-
bool sync = false;
1912-
bool repeated = false;
1931+
std::atomic<bool> sync{false};
1932+
std::atomic<bool> repeated{false};
19131933
auto lambda_request_answer = [&](eprosima::fastdds::dds::DataReader *reader, DSR::DSRGraph *graph)
19141934
{
19151935
while (true)

api/include/dsr/api/dsr_api.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,6 @@ namespace DSR
585585
const bool copy;
586586
std::unique_ptr<Utilities> utils;
587587
std::unordered_set<std::string_view> ignored_attributes;
588-
ThreadPool tp, tp_delta_attr;
589588
bool same_host;
590589
id_generator generator;
591590
GraphSettings::LOGLEVEL log_level;
@@ -678,6 +677,11 @@ namespace DSR
678677
std::unordered_multimap<uint64_t, std::tuple<uint64_t, std::string, mvreg<DSR::CRDTEdge>, uint64_t>> unprocessed_delta_edge_to;
679678
std::unordered_multimap<std::tuple<uint64_t, uint64_t, std::string>, std::tuple<std::string, mvreg<DSR::CRDTAttribute>, uint64_t>, hash_tuple> unprocessed_delta_edge_att;
680679

680+
// ThreadPools are declared after all data they access so that their
681+
// destructors (which join worker threads) run before the data members
682+
// are destroyed, preventing use-after-free data races on shutdown.
683+
ThreadPool tp, tp_delta_attr;
684+
681685
//Custom function for each rtps topic
682686
class NewMessageFunctor {
683687
public:

benchmarks/CMakeLists.txt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,16 @@ set_target_properties(dsr_benchmarks PROPERTIES
7979

8080
target_compile_options(dsr_benchmarks PUBLIC -g -std=c++23)
8181

82+
# -DTSAN=ON enables ThreadSanitizer. Requires the dsr_api/dsr_core libraries
83+
# to also be built with TSAN=ON (via the root CMakeLists.txt), otherwise TSan
84+
# will report false positives from uninstrumented library code.
85+
option(TSAN "Enable ThreadSanitizer" OFF)
86+
if (TSAN)
87+
message(STATUS "ThreadSanitizer enabled for benchmarks")
88+
target_compile_options(dsr_benchmarks PRIVATE -fsanitize=thread -fno-omit-frame-pointer)
89+
target_link_options(dsr_benchmarks PRIVATE -fsanitize=thread)
90+
endif()
91+
8292
# Include directories
8393
target_include_directories(dsr_benchmarks PRIVATE
8494
${CMAKE_CURRENT_SOURCE_DIR}
@@ -107,6 +117,28 @@ add_custom_command(TARGET dsr_benchmarks POST_BUILD
107117
COMMENT "Creating results directory"
108118
)
109119

120+
# Flamegraph target — generates one SVG per benchmark test case via perf.
121+
# Requires: perf, and FlameGraph scripts (flamegraph.pl + stackcollapse-perf.pl).
122+
# Set FG_DIR to the FlameGraph checkout if the scripts aren't on PATH, e.g.:
123+
# cmake --build . --target flamegraph -j1 -- FG_DIR=/opt/FlameGraph
124+
# Or pass a Catch2 filter to profile a subset:
125+
# cmake --build . --target flamegraph -j1 -- BENCH_FILTER=[LATENCY]
126+
set(FLAMEGRAPH_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/flamegraph.sh)
127+
set(FLAMEGRAPH_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}/results/flamegraphs)
128+
add_custom_target(flamegraph
129+
COMMAND ${CMAKE_COMMAND} -E make_directory ${FLAMEGRAPH_OUTDIR}
130+
COMMAND env
131+
FG_DIR=$ENV{FG_DIR}
132+
${FLAMEGRAPH_SCRIPT}
133+
-b $<TARGET_FILE:dsr_benchmarks>
134+
-o ${FLAMEGRAPH_OUTDIR}
135+
$ENV{BENCH_FILTER}
136+
DEPENDS dsr_benchmarks
137+
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
138+
COMMENT "Generating per-benchmark flamegraphs in ${FLAMEGRAPH_OUTDIR}"
139+
USES_TERMINAL
140+
)
141+
110142
# Register tests with CTest (optional)
111143
# Disabled auto-discovery as it requires running the binary at build time
112144
# which may fail if libraries are not in LD_LIBRARY_PATH

benchmarks/consistency/conflict_rate_bench.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,9 @@ TEST_CASE("Conflict rate benchmarks", "[CONSISTENCY][conflict][.multi]") {
215215
for (int round = 0; round < NUM_ROUNDS; ++round) {
216216
// Both agents try to create the same edge simultaneously
217217
auto edge_a = GraphGenerator::create_test_edge(
218-
node1_id, node2_id, agent_a->get_agent_id(), "conflict_edge");
218+
node1_id, node2_id, agent_a->get_agent_id(), "test_edge");
219219
auto edge_b = GraphGenerator::create_test_edge(
220-
node1_id, node2_id, agent_b->get_agent_id(), "conflict_edge");
220+
node1_id, node2_id, agent_b->get_agent_id(), "test_edge");
221221

222222
std::thread ta([&]() { agent_a->insert_or_assign_edge(edge_a); });
223223
std::thread tb([&]() { agent_b->insert_or_assign_edge(edge_b); });
@@ -228,15 +228,15 @@ TEST_CASE("Conflict rate benchmarks", "[CONSISTENCY][conflict][.multi]") {
228228
fixture.wait_for_sync(std::chrono::milliseconds(200));
229229

230230
// Check both agents see the edge
231-
auto edge_on_a = agent_a->get_edge(node1_id, node2_id, "conflict_edge");
232-
auto edge_on_b = agent_b->get_edge(node1_id, node2_id, "conflict_edge");
231+
auto edge_on_a = agent_a->get_edge(node1_id, node2_id, "test_edge");
232+
auto edge_on_b = agent_b->get_edge(node1_id, node2_id, "test_edge");
233233

234234
if (!edge_on_a.has_value() || !edge_on_b.has_value()) {
235235
conflicts++;
236236
}
237237

238238
// Delete edge for next round
239-
agent_a->delete_edge(node1_id, node2_id, "conflict_edge");
239+
agent_a->delete_edge(node1_id, node2_id, "test_edge");
240240
fixture.wait_for_sync(std::chrono::milliseconds(100));
241241
}
242242

benchmarks/fixtures/graph_generator.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,8 @@ class GraphGenerator {
191191
private:
192192
std::string temp_filename() {
193193
static std::atomic<uint64_t> next_id{0};
194-
return "/tmp/dsr_bench_" + std::to_string(next_id.fetch_add(1, std::memory_order_relaxed)) + ".json";
194+
return "/tmp/dsr_bench_" + std::to_string(getpid()) + "_"
195+
+ std::to_string(next_id.fetch_add(1, std::memory_order_relaxed)) + ".json";
195196
}
196197

197198
std::vector<uint64_t> generate_node_ids(uint32_t count) {

benchmarks/fixtures/multi_agent_fixture.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ class MultiAgentFixture {
6565
qWarning("Can't create agents");
6666
return false;
6767
}
68+
if (config_file.empty()) {
69+
qWarning("create_agents: config_file is empty — graph generator likely failed to write to /tmp (check permissions)");
70+
return false;
71+
}
6872

6973
// Keep agent IDs deterministic while remaining disjoint across fixture
7074
// instances in the same process.

0 commit comments

Comments
 (0)