diff --git a/engine/main.cpp b/engine/main.cpp
index 2846c78..374e77d 100644
--- a/engine/main.cpp
+++ b/engine/main.cpp
@@ -310,7 +310,7 @@ int main(int argc, char *argv[]) {
 			pool.clear_search_vars();
 			pool.search(pos, rp, 1e9, 12, 1e18, 0);
 			pool.wait_finished();
-			tot_nodes += nodes[0];
+			tot_nodes += nodes[0].get();
 		}
 		uint64_t end = clock();
 		std::cout << tot_nodes << " nodes " << int(tot_nodes / ((double)(end - start) / CLOCKS_PER_SEC)) << " nps" << std::endl;
diff --git a/engine/search.cpp b/engine/search.cpp
index 6fd7899..85a9683 100644
--- a/engine/search.cpp
+++ b/engine/search.cpp
@@ -15,7 +15,7 @@ std::stringstream last_line;
 uint16_t num_threads = 1;
 
 std::atomic<uint64_t> nodecnt[64][64] = {{}};
-alignas(64) std::atomic<uint64_t> nodes[MAX_THREADS] = {};
+NodeCounter nodes[MAX_THREADS]; // node counters, one slot per thread id (indexed by ti.id in quiesce/negamax)
 std::atomic<uint64_t> tbhits = 0;
 
 uint64_t perft(Position &pos, int depth) {
@@ -192,14 +192,14 @@ bool is_valid_score(Value score) {
  * - Late move reduction (instead of reducing depth, we reduce the search window) (not a known technique, maybe worth trying?)
  */
 Value quiesce(Position &pos, ThreadInfo &ti, Value alpha, Value beta, int side, int ply, bool pv=false) {
-	nodes[ti.id].fetch_add(1, std::memory_order_relaxed);
+	nodes[ti.id]++;
 
 	if (pv) ti.pvlen[ply] = 0;
 
 	if (stop_search) return 0;
 
 	if (ti.is_main) {
-		auto cur_nodes = nodes[ti.id].load(std::memory_order_relaxed);
+		auto cur_nodes = nodes[ti.id].get();
 		if (!(cur_nodes & 4095)) {
 			// The time check is relatively expensive and thus only performed every 4096 nodes
 			auto time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
@@ -370,12 +370,12 @@ Value negamax(Position &pos, ThreadInfo &ti, int depth, Value alpha = -VALUE_INF
 		ti.seldepth = std::max(ti.seldepth, ply);
 	}
 
-	nodes[ti.id].fetch_add(1, std::memory_order_relaxed);
+	nodes[ti.id]++;
 
 	if (stop_search) return 0;
 
 	if (ti.is_main) {
-		auto cur_nodes = nodes[ti.id].load(std::memory_order_relaxed);
+		auto cur_nodes = nodes[ti.id].get();
 		if (!(cur_nodes & 4095)) {
 			// The time check is relatively expensive and thus only performed every 4096 nodes
 			auto time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
@@ -657,7 +657,7 @@ Value negamax(Position &pos, ThreadInfo &ti, int depth, Value alpha = -VALUE_INF
 	int i = 0;
 
 	uint64_t prev_nodes = 0;
-	if (root) prev_nodes = nodes[ti.id].load(std::memory_order_relaxed);
+	if (root) prev_nodes = nodes[ti.id].get();
 
 	ti.line[ply+1].cutoffcnt = 0;
 
@@ -881,7 +881,7 @@ Value negamax(Position &pos, ThreadInfo &ti, int depth, Value alpha = -VALUE_INF
 		ti.line[ply].corr_hist = nullptr;
 
 		if (root) {
-			auto cur_nodes = nodes[ti.id].load(std::memory_order_relaxed);
+			auto cur_nodes = nodes[ti.id].get();
 			nodecnt[move.src()][move.dst()].fetch_add(cur_nodes - prev_nodes, std::memory_order_relaxed);
 			prev_nodes = cur_nodes;
 		}
@@ -1048,7 +1048,7 @@ void iterativedeepening(Position &pos, ThreadInfo &ti, int depth) {
 			uint64_t bm_nodes = nodecnt[best_move.src()][best_move.dst()];
 			uint64_t tot_nodes = 0;
 			for (int t = 0; t < num_threads; t++) {
-				tot_nodes += nodes[t].load(std::memory_order_relaxed); // ig this is dangerous but whatever
+				tot_nodes += nodes[t].get();
 			}
 
 			// UCI output from main thread only
diff --git a/engine/search.hpp b/engine/search.hpp
index 6afc5d6..e1b6010 100644
--- a/engine/search.hpp
+++ b/engine/search.hpp
@@ -50,7 +50,23 @@ extern bool show_wdl;
 extern bool do_softnodes;
 extern bool do_datagen;
 
-extern std::atomic<uint64_t> nodes[MAX_THREADS];
+struct alignas(64) NodeCounter { // atomic 64-bit counter; 64-byte alignment gives every array element its own 64-byte slot (presumably to avoid false sharing -- confirm)
+	std::atomic<uint64_t> val = 0; // current count, starts at zero
+
+	void operator++(int) { // postfix form only (counter++); returns nothing, unlike the built-in operator
+		val.fetch_add(1, std::memory_order_relaxed); // relaxed: the increment is atomic but orders no other memory accesses
+	}
+
+    void operator=(uint64_t new_val) { // overwrite the count; returns void, so assignments cannot be chained
+        val.store(new_val, std::memory_order_relaxed); // relaxed store, same ordering policy as the increment
+    }
+
+    uint64_t get() const { // read the current count
+        return val.load(std::memory_order_relaxed); // relaxed load: atomic read with no ordering guarantee against other memory
+    }
+};
+
+extern NodeCounter nodes[MAX_THREADS]; // declaration only; the array is defined in search.cpp
 
 struct ThreadInfo {
 	Position pos;