Merge branch 'refactor_evgen' of github.com:MadGraphTeam/MadGraph7 into refactor_evgen

theoheimel · theoheimel · commit 7ec3fafd0c01 · 2026-04-05T19:04:50.000+02:00
diff --git a/madspace/include/madspace/madcode/function.h b/madspace/include/madspace/madcode/function.h
@@ -53,6 +53,8 @@ class Function {
     ValueVec _locals;
     std::unordered_map<std::string, Value> _globals;
     std::vector<InstructionCall> _instructions;
+
+    friend Function sort_breadth_first(const Function& function);
 };
 
 std::ostream& operator<<(std::ostream& out, const Value& value);
diff --git a/madspace/include/madspace/madcode/optimizer.h b/madspace/include/madspace/madcode/optimizer.h
@@ -10,24 +10,27 @@ class InstructionDependencies {
 public:
     InstructionDependencies(const Function& function);
     bool depends(std::size_t test_index, std::size_t dependency_index) {
-        return matrix[test_index * size + dependency_index];
+        return _matrix[test_index * _size + dependency_index]; // flattened _size x _size matrix, unchecked access
     }
+    const std::vector<int>& ranks() const { return _ranks; } // one rank per instruction, in instruction order
 
 private:
-    std::size_t size;
-    std::vector<bool> matrix;
-    std::vector<int> ranks;
+    std::size_t _size;
+    std::vector<bool> _matrix;
+    std::vector<int> _ranks;
 };
 
 class LastUseOfLocals {
 public:
     LastUseOfLocals(const Function& function);
     std::vector<std::size_t>& local_indices(std::size_t index) {
-        return last_used.at(index);
+        return _last_used.at(index); // bounds-checked: throws std::out_of_range
     }
 
 private:
-    std::vector<std::vector<std::size_t>> last_used;
+    std::vector<std::vector<std::size_t>> _last_used; // one inner vector per instruction
 };
 
+Function sort_breadth_first(const Function& function);
+
 } // namespace madspace
diff --git a/madspace/src/gpu/runtime.cu b/madspace/src/gpu/runtime.cu
@@ -973,20 +973,20 @@ public:
         _stream_count(stream_count), _sync_matrix(stream_count * stream_count, true) {}
 
     bool is_in_sync_with(std::size_t this_stream, std::size_t other_stream) const {
-        return _sync_matrix.at(this_stream * stream_count + other_stream);
+        return _sync_matrix.at(this_stream * _stream_count + other_stream); // row this_stream, column other_stream
     }
     void desynchronize(std::size_t this_stream) {
         for (std::size_t other_stream = 0; other_stream < _stream_count;
-             ++_other_stream) {
+             ++other_stream) {
             if (this_stream != other_stream) {
-                _sync_matrix.at(other_stream * stream_count + this_stream) = false;
+                _sync_matrix.at(other_stream * _stream_count + this_stream) = false; // clears column this_stream; the diagonal entry is left untouched
             }
         }
     }
     void synchronize(std::size_t this_stream, std::size_t other_stream) {
         for (std::size_t i = 0; i < _stream_count; ++i) {
             if (is_in_sync_with(other_stream, i)) {
-                _sync_matrix.at(this_stream * stream_count + i) = true;
+                _sync_matrix.at(this_stream * _stream_count + i) = true; // this_stream inherits every stream other_stream is in sync with; entries are only ever set here
             }
         }
     }
@@ -995,14 +995,15 @@ public:
 private:
     std::size_t _stream_count;
     std::vector<bool> _sync_matrix;
-}
+};
 
 } // namespace
 
 GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
     _context(context),
-    _input_count(function.inputs().size()) _gpublas_handle(
-        context.thread_pool(),
+    _input_count(function.inputs().size()),
+    _gpublas_handle(
+        context->thread_pool(),
         []() {
             gpublasHandle_t handle;
             check_error(gpublasCreate(&handle));
@@ -1011,7 +1012,7 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
         [](gpublasHandle_t handle) { check_error(gpublasDestroy(handle)); }
     ),
     _gpurand_generator(
-        context.thread_pool(),
+        context->thread_pool(),
         []() {
             gpurandGenerator_t handle;
             check_error(gpurandCreateGenerator(&handle, GPURAND_RNG_PSEUDO_DEFAULT));
@@ -1035,52 +1036,38 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
     std::size_t stream_count = 0, event_count = 0, backward_event_count = 0;
     for (auto& instr : function.instructions()) {
         if (instr.stream_index >= stream_count) {
-            stream_count = instr.stream_index;
+            stream_count = instr.stream_index + 1;
         }
     }
     SyncTracker sync_tracker(stream_count);
     std::vector<int> local_source_streams(function.locals().size(), -1);
     SizeVec last_stream_instrs(stream_count);
-    nested_vector2<std::size_t> local_consumer_streams(function.locals().size());
     nested_vector2<std::size_t> backward_wait_events(function.instructions().size());
     std::vector<int> backward_record_events(function.instructions().size(), -1);
 
-    for (auto& instr : function.instructions()) {
-        if (instr.stream_index >= stream_count) {
-            stream_count = instr.stream_index + 1;
-        }
-    }
-
-    auto update_sync = [&](std::size_t local_index,
-                           std::size_t stream_index,
-                           SizeVec& wait_events,
-                           auto get_event) {
-        int source_stream = local_source_streams.at(local_index);
-        auto& consumer_streams = local_consumer_streams.at(local_index);
-        if (std::find(consumer_streams.begin(), consumer_streams.end(), stream_index) ==
-            consumer_streams.end()) {
-            consumer_streams.push_back(stream_index);
-        }
-        if (!sync_tracker.is_in_sync_with(stream_index, source_stream)) {
-            wait_events.push_back(get_event(source_stream));
-            sync_tracker.synchronize(stream_index, source_stream);
-        }
-    } auto get_event_backward = [&](std::size_t source_stream) -> int {
-        int& event = backward_record_events.at(last_stream_instrs.at(source_stream));
-        if (event == -1) {
-            event = backward_event_count;
-            ++backward_event_count;
-        }
-        return event;
-    };
+    auto update_sync_backward =
+        [&](std::size_t local_index, std::size_t stream_index, SizeVec& wait_events) {
+            int source_stream = local_source_streams.at(local_index);
+            if (source_stream == -1) {
+                return;
+            }
+            if (!sync_tracker.is_in_sync_with(stream_index, source_stream)) {
+                int& event =
+                    backward_record_events.at(last_stream_instrs.at(source_stream));
+                if (event == -1) {
+                    event = backward_event_count;
+                    ++backward_event_count;
+                }
+                wait_events.push_back(event);
+                sync_tracker.synchronize(stream_index, source_stream);
+            }
+        };
 
     for (std::size_t instr_index = 0; auto [instr, bw_wait_events] :
                                       zip(std::views::reverse(function.instructions()),
                                           std::views::reverse(backward_wait_events))) {
         for (auto& out : instr.outputs) {
-            update_sync(
-                out.local_index, instr.stream_index, bw_wait_events, get_event_backward
-            );
+            update_sync_backward(out.local_index, instr.stream_index, bw_wait_events);
         }
         for (auto& in : instr.inputs) {
             local_source_streams.at(in.local_index) = instr.stream_index;
@@ -1090,22 +1077,35 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
         ++instr_index;
     }
     for (auto& in : function.inputs()) {
-        update_sync(in.local_index, 0, _wait_events, get_event_forward);
+        update_sync_backward(in.local_index, 0, _backward_wait_events);
     }
 
     sync_tracker.reset();
-    local_source_streams.clear();
-    last_stream_instrs.clear();
-    local_consumer_streams.clear();
-
-    auto get_event_forward = [&](std::size_t source_stream) -> int {
-        int& event =
-            _instructions.at(last_stream_instrs.at(source_stream)).record_event;
-        if (event == -1) {
-            event = event_count;
-            ++event_count;
+    std::fill(local_source_streams.begin(), local_source_streams.end(), -1);
+    nested_vector2<std::size_t> local_consumer_streams(function.locals().size());
+
+    auto update_sync = [&](std::size_t local_index,
+                           std::size_t stream_index,
+                           SizeVec& wait_events) {
+        int source_stream = local_source_streams.at(local_index);
+        if (source_stream == -1) {
+            return;
+        }
+        auto& consumer_streams = local_consumer_streams.at(local_index);
+        if (std::find(consumer_streams.begin(), consumer_streams.end(), stream_index) ==
+            consumer_streams.end()) {
+            consumer_streams.push_back(stream_index);
+        }
+        if (!sync_tracker.is_in_sync_with(stream_index, source_stream)) {
+            int& event =
+                _instructions.at(last_stream_instrs.at(source_stream)).record_event;
+            if (event == -1) {
+                event = event_count;
+                ++event_count;
+            }
+            wait_events.push_back(event);
+            sync_tracker.synchronize(stream_index, source_stream);
         }
-        return event;
     };
 
     std::vector<bool> is_input(function.locals().size());
@@ -1141,9 +1141,7 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
             if (in.type.batch_size != BatchSize::one) {
                 batch_size_index = in.local_index;
             }
-            update_sync(
-                in.local_index, instr.stream_index, bw_wait_events, get_event_forward
-            );
+            update_sync(in.local_index, instr.stream_index, bw_wait_events);
         }
         SizeVec output_indices;
         std::vector<DataType> output_dtypes;
@@ -1161,6 +1159,7 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
         }
 
         sync_tracker.desynchronize(instr.stream_index);
+        last_stream_instrs.at(instr.stream_index) = _instructions.size();
         _instructions.push_back({
             instr.instruction->opcode(),
             input_indices,
@@ -1176,16 +1175,18 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
             wait_events,
             -1,
             bw_wait_events,
-            bw_record_events,
+            bw_record_event,
         });
 
-        auto locals_to_free = last_use.local_indices(instr_index);
+        SizeVec locals_to_free = last_use.local_indices(instr_index);
         free_queue.insert(
             free_queue.end(), locals_to_free.begin(), locals_to_free.end()
         );
         free_queue.erase(
             std::remove_if(
-                free_queue.begin(), free_queue.end(), [&](std::size_t local_index) {
+                free_queue.begin(),
+                free_queue.end(),
+                [&](std::size_t local_index) {
                     for (std::size_t consumer_stream :
                          local_consumer_streams.at(local_index)) {
                         if (!sync_tracker.is_in_sync_with(
@@ -1213,7 +1214,8 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
                     );
                     return true;
                 }
-            )
+            ),
+            free_queue.end()
         );
 
         ++instr_index;
@@ -1252,11 +1254,11 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
 
     for (auto& out : function.outputs()) {
         _output_indices.push_back(out.local_index);
-        update_sync(out.local_index, 0, _wait_events, get_event_forward);
+        update_sync(out.local_index, 0, _wait_events);
     }
 
     _streams = ThreadResource<std::vector<gpuStream_t>>(
-        context.thread_pool(),
+        context->thread_pool(),
         [stream_count]() {
             std::vector<gpuStream_t> streams(stream_count);
             for (auto& item : streams) {
@@ -1272,7 +1274,7 @@ GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :
     );
     std::size_t max_event_count = std::max(event_count, backward_event_count);
     _events = ThreadResource<std::vector<gpuEvent_t>>(
-        context.thread_pool(),
+        context->thread_pool(),
         [max_event_count]() {
             std::vector<gpuEvent_t> events(max_event_count);
             for (auto& item : events) {
@@ -1388,7 +1390,7 @@ std::tuple<TensorVec, TensorVec, std::vector<bool>> GpuRuntime::run_with_grad(
     for (auto index : _output_indices) {
         outputs.push_back(locals[index]);
     }
-    check_error(gpuStreamSynchronize(main_stream);
+    check_error(gpuStreamSynchronize(main_stream));
     return {outputs, locals, eval_grad};
 }
 
@@ -1411,7 +1413,7 @@ GpuRuntime::run_backward(
     gpuStream_t main_stream = streams.at(0);
     for (auto [instr, instr_eval_grad] :
          zip(std::views::reverse(_instructions), std::views::reverse(eval_grad))) {
-        /*gpuStream_t stream = streams.at(instr.backward_stream);
+        /*gpuStream_t stream = streams.at(instr.stream);
         for (auto event : instr.backward_wait_events) {
             check_error(gpuStreamWaitEvent(stream, events.at(event)));
         }*/
@@ -1432,8 +1434,8 @@ GpuRuntime::run_backward(
 #include "runtime_backward_mixin.h"
             }
         }
-        /*if (instr.backward_record_event) {
-            check_error(gpuEventRecord(instr.backward_record_event, stream));
+        /*if (instr.backward_record_event != -1) {
+            check_error(gpuEventRecord(events.at(instr.backward_record_event), stream));
         }*/
     }
     /*for (auto event : _backward_wait_events) {
diff --git a/madspace/src/madcode/optimizer.cpp b/madspace/src/madcode/optimizer.cpp
@@ -10,7 +10,7 @@
 using namespace madspace;
 
 InstructionDependencies::InstructionDependencies(const Function& function) :
-    size(function.instructions().size()), matrix(size * size) {
+    _size(function.instructions().size()), _matrix(_size * _size) {
     std::vector<int> local_source(function.locals().size(), -1);
     int index = 0;
     for (auto& instr : function.instructions()) {
@@ -20,26 +20,26 @@ InstructionDependencies::InstructionDependencies(const Function& function) :
             if (source_index == -1) {
                 continue;
             }
-            matrix.at(index * size + source_index) = true;
-            for (int i = 0; i < size; ++i) {
-                matrix.at(index * size + i) =
-                    matrix.at(index * size + i) | matrix.at(source_index * size + i);
+            _matrix.at(index * _size + source_index) = true;
+            for (int i = 0; i < _size; ++i) {
+                _matrix.at(index * _size + i) =
+                    _matrix.at(index * _size + i) | _matrix.at(source_index * _size + i);
             }
-            int source_rank = ranks.at(source_index);
+            int source_rank = _ranks.at(source_index);
             if (rank < source_rank) {
                 rank = source_rank;
             }
         }
         for (auto& output : instr.outputs) {
             local_source.at(output.local_index) = index;
         }
-        ranks.push_back(rank + 1);
+        _ranks.push_back(rank + 1);
         ++index;
     }
 }
 
 LastUseOfLocals::LastUseOfLocals(const Function& function) :
-    last_used(function.instructions().size()) {
+    _last_used(function.instructions().size()) {
     std::vector<bool> seen_locals;
     for (auto& local : function.locals()) {
         seen_locals.push_back(
@@ -50,7 +50,7 @@ LastUseOfLocals::LastUseOfLocals(const Function& function) :
         seen_locals.at(output.local_index) = true;
     }
     auto instr = function.instructions().rbegin();
-    auto indices = last_used.begin();
+    auto indices = _last_used.begin();
     for (; instr != function.instructions().rend(); ++instr, ++indices) {
         for (auto& input : instr->inputs) {
             auto index = input.local_index;
@@ -60,5 +60,22 @@ LastUseOfLocals::LastUseOfLocals(const Function& function) :
             }
         }
     }
-    std::reverse(last_used.begin(), last_used.end());
+    std::reverse(_last_used.begin(), _last_used.end());
+}
+
+Function madspace::sort_breadth_first(const Function& function) {
+    Function func_out = function; // full copy of the input; only the instruction order is rebuilt below
+    InstructionDependencies dependencies(function);
+    const std::vector<int>& order = dependencies.ranks(); // bind to the member instead of copying the vector
+    std::vector<std::size_t> instruction_perm(function.instructions().size());
+    std::iota(instruction_perm.begin(), instruction_perm.end(), 0);
+    std::stable_sort( // stable: instructions with equal rank keep their original relative order
+        instruction_perm.begin(), instruction_perm.end(),
+        [&](std::size_t i, std::size_t j) { return order.at(i) < order.at(j); }
+    );
+    func_out._instructions.clear();
+    for (std::size_t index : instruction_perm) {
+        func_out._instructions.push_back(function._instructions.at(index));
+    }
+    return func_out;
 }
diff --git a/madspace/src/phasespace/integrand.cpp b/madspace/src/phasespace/integrand.cpp