diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 0d7c5e1025..f3f624ebcc 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -146,6 +146,8 @@ def callback_after_mpi_timer(): step_begin=callback_before_mpi_timer, step_end=callback_after_mpi_timer ) + continue + for i in range(10): if shamrock.sys.world_rank() == 0: print("running step ", i + 1, "/", 10, " ...") diff --git a/src/shamalgs/include/shamalgs/buf_checksum.hpp b/src/shamalgs/include/shamalgs/buf_checksum.hpp new file mode 100644 index 0000000000..adfb7a2d28 --- /dev/null +++ b/src/shamalgs/include/shamalgs/buf_checksum.hpp @@ -0,0 +1,35 @@ +// -------------------------------------------------------// +// +// SHAMROCK code for hydrodynamics +// Copyright (c) 2021-2026 Timothée David--Cléris +// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1 +// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information +// +// -------------------------------------------------------// + +#pragma once + +/** + * @file buf_checksum.hpp + * @author Timothée David--Cléris (tim.shamrock@proton.me) + * @brief + * + */ + +#include "shambase/checksum.hpp" +#include "shamalgs/primitives/flatten.hpp" +#include "shambackends/DeviceBuffer.hpp" + +namespace shamalgs { + + template + inline u64 buf_checksum(const sham::DeviceBuffer &buf) { + auto flattened_buf = primitives::flatten_buffer(buf); + + using Tscal = typename shambase::VectorProperties::component_type; + std::vector data = flattened_buf.copy_to_stdvec(); + return shambase::fnv1a_hash( + reinterpret_cast(data.data()), data.size() * sizeof(Tscal)); + } + +} // namespace shamalgs diff --git a/src/shamalgs/src/collective/sparse_exchange.cpp b/src/shamalgs/src/collective/sparse_exchange.cpp index 6e6625544d..970d35d051 100644 --- a/src/shamalgs/src/collective/sparse_exchange.cpp +++ b/src/shamalgs/src/collective/sparse_exchange.cpp @@ -56,6 +56,7 @@ namespace shamalgs::collective { /// fetch u64_2 from global message data std::vector fetch_global_message_data( const std::vector &messages_send) { + __shamrock_stack_entry(); std::vector local_data = std::vector(messages_send.size()); @@ -84,6 +85,7 @@ namespace shamalgs::collective { /// decode message to get message std::vector decode_all_message(const std::vector &global_data) { + __shamrock_stack_entry(); std::vector message_all(global_data.size()); for (u64 i = 0; i < global_data.size(); i++) { message_all[i] = unpack(global_data[i]); @@ -94,6 +96,7 @@ namespace shamalgs::collective { /// compute message tags void compute_tags(std::vector &message_all) { + __shamrock_stack_entry(); std::vector tag_map(shamcomm::world_size(), 0); diff --git a/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp b/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp index 4200bef3c6..1963cceef8 100644 --- a/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp +++ b/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp @@ -344,13 +344,13 @@ auto GSPHGhostHandler::gen_id_table_interfaces(GeneratorMap &&gen) for (auto &[k, v] : send_count_stats) { if (v > 0.2) { - warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); + // warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); has_warn = true; } } if (has_warn && shamcomm::world_rank() == 0) { - warn_log = "\n This can lead to high mpi " + warn_log = "\n High interf/patch volume. This can lead to high mpi " "overhead, try to increase the patch split crit" + warn_log; } diff --git a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp index af5f17a9bd..80dacb3af9 100644 --- a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp +++ b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp @@ -86,7 +86,14 @@ namespace shammodels::sph { PatchField interactR_patch = sched.map_owned_to_patch_field_simple( [&](const Patch p, PatchDataLayer &pdat) -> flt { if (!pdat.is_empty()) { +#if false + auto tmp = pdat.get_field(ihpart).compute_max() * h_evol_max * Rkern; + shamcomm::logs::raw_ln( + shambase::format("patch {}, Rghost = {}", p.id_patch, tmp)); + return tmp; +#else return pdat.get_field(ihpart).compute_max() * h_evol_max * Rkern; +#endif } else { return shambase::VectorProperties::get_min(); } diff --git a/src/shammodels/sph/src/BasicSPHGhosts.cpp b/src/shammodels/sph/src/BasicSPHGhosts.cpp index 862f6ddb40..d96714f229 100644 --- a/src/shammodels/sph/src/BasicSPHGhosts.cpp +++ b/src/shammodels/sph/src/BasicSPHGhosts.cpp @@ -560,13 +560,13 @@ auto BasicSPHGhostHandler::gen_id_table_interfaces(GeneratorMap &&gen) for (auto &[k, v] : send_count_stats) { if (v > 0.2) { - warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); + // warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); has_warn = true; } } if (has_warn && shamcomm::world_rank() == 0) { - warn_log = "\n This can lead to high mpi " + warn_log = "\n High interf/patch volume. This can lead to high mpi " "overhead, try to increase the patch split crit" + warn_log; } diff --git a/src/shammodels/sph/src/modules/NeighbourCache.cpp b/src/shammodels/sph/src/modules/NeighbourCache.cpp index 123e0ddcac..ac708cd16d 100644 --- a/src/shammodels/sph/src/modules/NeighbourCache.cpp +++ b/src/shammodels/sph/src/modules/NeighbourCache.cpp @@ -18,13 +18,21 @@ #include "shambase/aliases_int.hpp" #include "shambase/assert.hpp" #include "shambase/memory.hpp" +#include "shambase/string.hpp" +#include "shamalgs/buf_checksum.hpp" #include "shambackends/DeviceBuffer.hpp" +#include "shamcmdopt/env.hpp" +#include "shamcomm/worldInfo.hpp" #include "shammath/sphkernels.hpp" #include "shammodels/sph/modules/NeighbourCache.hpp" #include "shamsys/legacy/log.hpp" #include "shamtree/TreeTraversal.hpp" #include "shamtree/kernels/geometry_utils.hpp" #include "shamunits/Constants.hpp" +#include +#include + +std::string checksum_prefix = shambase::get_check_ref(shamcmdopt::getenv_str("CHECKSUM_PREFIX")); template class SPHKernel> void shammodels::sph::modules::NeighbourCache::start_neighbors_cache() { @@ -262,6 +270,10 @@ void shammodels::sph::modules::NeighbourCache:: Tscal h_tolerance = solver_config.htol_up_coarse_cycle; + NamedStackEntry stack_loc1ddddd{"wait queue"}; + + shamsys::instance::get_compute_queue().wait_and_throw(); + NamedStackEntry stack_loc1{"init cache"}; // start by counting number of leaf neighbours @@ -269,11 +281,90 @@ void shammodels::sph::modules::NeighbourCache:: sham::DeviceBuffer neigh_count_leaf( leaf_cnt, shamsys::instance::get_compute_scheduler_ptr()); - shamsys::instance::get_compute_queue().wait_and_throw(); - shamlog_debug_sycl_ln("Cache", "generate cache for Nleaf=", leaf_cnt); + std::string checksum_file_path + = checksum_prefix + "/" + fmt::format("patch_{}_debug.txt", patch_id); + { + std::ofstream patch_file(checksum_file_path, std::ios::app); + patch_file << fmt::format( + "patch {} buf_xyz hash={}\n", patch_id, shamalgs::buf_checksum(buf_xyz)); + patch_file << fmt::format( + "patch {} buf_hpart hash={}\n", patch_id, shamalgs::buf_checksum(buf_hpart)); + patch_file << fmt::format( + "patch {} tree_field_rint hash={}\n", + patch_id, + shamalgs::buf_checksum(tree_field_rint)); + // patch_file << fmt::format( + // "patch {} neigh_count_leaf hash={}\n", + // patch_id, + // shamalgs::buf_checksum(neigh_count_leaf)); + patch_file << fmt::format( + "patch {} leaf_it.aabb_min hash={}\n", + patch_id, + shamalgs::buf_checksum(leaf_it.aabb_min)); + patch_file << fmt::format( + "patch {} leaf_it.aabb_max hash={}\n", + patch_id, + shamalgs::buf_checksum(leaf_it.aabb_max)); + patch_file << fmt::format( + "patch {} leaf_it.tree_traverser.buf_lchild_id hash={}\n", + patch_id, + shamalgs::buf_checksum(leaf_it.tree_traverser.buf_lchild_id)); + patch_file << fmt::format( + "patch {} leaf_it.tree_traverser.buf_rchild_id hash={}\n", + patch_id, + shamalgs::buf_checksum(leaf_it.tree_traverser.buf_rchild_id)); + patch_file << fmt::format( + "patch {} leaf_it.tree_traverser.buf_lchild_flag hash={}\n", + patch_id, + shamalgs::buf_checksum(leaf_it.tree_traverser.buf_lchild_flag)); + patch_file << fmt::format( + "patch {} leaf_it.tree_traverser.buf_rchild_flag hash={}\n", + patch_id, + shamalgs::buf_checksum(leaf_it.tree_traverser.buf_rchild_flag)); + patch_file << fmt::format( + "patch {} leaf_it.tree_traverser.offset_leaf = {}\n", + patch_id, + leaf_it.tree_traverser.offset_leaf); + + // other tree fields + auto &tmp1 = tree.structure.buf_endrange; + auto &tmp2 = tree.reduced_morton_set.buf_reduc_index_map; + auto &tmp3 = tree.reduced_morton_set.reduced_morton_codes; + auto &tmp4 = tree.reduced_morton_set.morton_codes_set.sorted_morton_codes; + auto &tmp5 = tree.reduced_morton_set.morton_codes_set.map_morton_id_to_obj_id; + patch_file << fmt::format( + "patch {} tree.structure.buf_endrange hash={}\n", + patch_id, + shamalgs::buf_checksum(tmp1)); + patch_file << fmt::format( + "patch {} tree.reduced_morton_set.buf_reduc_index_map hash={}\n", + patch_id, + shamalgs::buf_checksum(tmp2)); + patch_file << fmt::format( + "patch {} tree.reduced_morton_set.reduced_morton_codes hash={}\n", + patch_id, + shamalgs::buf_checksum(tmp3)); + patch_file << fmt::format( + "patch {} tree.reduced_morton_set.morton_codes_set.sorted_morton_codes hash={}\n", + patch_id, + shamalgs::buf_checksum(tmp4)); + patch_file << fmt::format( + "patch {} tree.reduced_morton_set.morton_codes_set.map_morton_id_to_obj_id " + "hash={}\n", + patch_id, + shamalgs::buf_checksum(tmp5)); + } + + // replay the kernel like a madman + for (u32 i = 0; i < 1000; i++) { + + if (shamcomm::world_rank() == 0 && i % 100 == 0) { + logger::raw_ln(shambase::format("replay the kernel {}/1000", i)); + } + sham::DeviceQueue &q = shamsys::instance::get_compute_scheduler().get_queue(); sham::EventList depends_list; @@ -324,6 +415,22 @@ void shammodels::sph::modules::NeighbourCache:: tree_field_rint.complete_event_state(e); neigh_count_leaf.complete_event_state(e); leaf_it.complete_event_state(e); + + NamedStackEntry stack_loc1ccccc{"wait queue"}; + + shamsys::instance::get_compute_queue().wait_and_throw(); + } + + NamedStackEntry stack_loc1ccccc{"wait queue"}; + + shamsys::instance::get_compute_queue().wait_and_throw(); + + { + std::ofstream patch_file(checksum_file_path, std::ios::app); + patch_file << fmt::format( + "patch {} neigh_count_leaf hash={}\n", + patch_id, + shamalgs::buf_checksum(neigh_count_leaf)); } //{ @@ -342,9 +449,15 @@ void shammodels::sph::modules::NeighbourCache:: // } //} + NamedStackEntry stack_loc1bbbb{"prepare cache"}; + tree::ObjectCache pleaf_cache = tree::prepare_object_cache(std::move(neigh_count_leaf), leaf_cnt); + NamedStackEntry stack_loc1aaaa{"wait queue"}; + + shamsys::instance::get_compute_queue().wait_and_throw(); + // fill ids of leaf neighbours NamedStackEntry stack_loc2{"fill cache"}; diff --git a/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp b/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp index 8449755de1..b2decdfb4b 100644 --- a/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp +++ b/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp @@ -269,10 +269,13 @@ class SerialPatchTree { sycl::queue &queue, shamrock::patch::PatchField pfield, Func &&reducer) { + __shamrock_stack_entry(); + shamrock::patch::PatchtreeField ptfield; ptfield.allocate(get_element_count()); { + __shamrock_stack_entry(); sycl::host_accessor lpid{ shambase::get_check_ref(linked_patch_ids_buf), sycl::read_only}; sycl::host_accessor tree_field{ @@ -280,6 +283,8 @@ class SerialPatchTree { // init reduction std::unordered_map &idp_to_gid = sched.patch_list.id_patch_to_global_idx; + +#pragma omp parallel for for (u64 idx = 0; idx < get_element_count(); idx++) { tree_field[idx] = (lpid[idx] != u64_max) ? pfield.get(lpid[idx]) : T(); } diff --git a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp index c87577db9a..5edb418156 100644 --- a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp +++ b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp @@ -30,6 +30,28 @@ void shamrock::solvergraph::ExchangeGhostLayer::_impl_evaluate_internal() { auto &ghost_layer = edges.ghost_layer; const shamrock::solvergraph::RankGetter &rank_owner = edges.rank_owner; +#if false + std::unordered_map msg_sizes_send; + std::unordered_map msg_sizes_max_send; + + std::stringstream ss; + ss << "Rank " << shamcomm::world_rank() << " is sending " + << ghost_layer.patchdatas.get_native().size() << " patches sizes:"; + for (auto &pdat : ghost_layer.patchdatas.get_native()) { + u64 key = rank_owner.get_rank_owner(pdat.first.first); + // ss << pdat.first.first << " " << pdat.first.second << " " << pdat.second.get_obj_cnt() << + // "\n"; + msg_sizes_send[key] += pdat.second.get_obj_cnt(); + msg_sizes_max_send[key] = std::max(msg_sizes_max_send[key], u64(pdat.second.get_obj_cnt())); + } + for (auto &[rank, size] : msg_sizes_send) { + ss << "\n" + << "msg size from rank " << rank << " is " << size << " max is " + << msg_sizes_max_send[rank]; + } + shamcomm::logs::raw_ln(ss.str()); +#endif + shambase::DistributedDataShared recv_dat; shamalgs::collective::serialize_sparse_comm( diff --git a/src/tests/shamalgs/numeric/exclusiveScanTests.cpp b/src/tests/shamalgs/numeric/exclusiveScanTests.cpp index cd68f6a52b..a822a4c2ab 100644 --- a/src/tests/shamalgs/numeric/exclusiveScanTests.cpp +++ b/src/tests/shamalgs/numeric/exclusiveScanTests.cpp @@ -145,14 +145,17 @@ struct TestExclScanUSM { void check() { if constexpr (std::is_same::value) { - u32 len_test = 1e5; + u32 len_test = 32e6; - std::vector data = shamalgs::primitives::mock_vector(0x111, len_test, 0, 10); + std::vector data = shamalgs::primitives::mock_vector( + 0x111 + shambase::details::get_wtime() * 100000000, len_test, 0, 60); std::vector data_buf(data); std::exclusive_scan(data.begin(), data.end(), data.begin(), 0); + std::cout << "total = " << data[len_test - 1] + data_buf[len_test - 1] << std::endl; + sham::DeviceBuffer buf{ data_buf.size(), shamsys::instance::get_compute_scheduler_ptr()}; buf.copy_from_stdvec(data_buf); @@ -482,7 +485,10 @@ TestStart( TestExclScanUSM test( (TestExclScanUSM::vFunctionCall) shamalgs::numeric::details::exclusive_sum_atomic_decoupled_v5_usm); - test.check(); + + for (u32 i = 0; i < 1000; i++) { + test.check(); + } } #endif