Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ namespace shammodels::gsph {

PatchtreeField<Tscal> interactR_mpi_tree = sptree.make_patch_tree_field(
sched,
shamsys::instance::get_compute_queue(),
shamsys::instance::get_compute_scheduler_ptr(),
interactR_patch,
[](Tscal h0, Tscal h1, Tscal h2, Tscal h3, Tscal h4, Tscal h5, Tscal h6, Tscal h7) {
return sham::max_8points(h0, h1, h2, h3, h4, h5, h6, h7);
Expand Down
9 changes: 3 additions & 6 deletions src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ auto GSPHGhostHandler<vec>::find_interfaces(
using BCShearingPeriodic = typename CfgClass::ShearingPeriodic;

if (BCPeriodic *cfg = std::get_if<BCPeriodic>(&ghost_config)) {
sycl::host_accessor acc_tf{
shambase::get_check_ref(int_range_max_tree.internal_buf), sycl::read_only};
auto acc_tf = int_range_max_tree.internal_buf->template mirror_to<sham::host>();

for (i32 xoff = -repetition_x; xoff <= repetition_x; xoff++) {
for (i32 yoff = -repetition_y; yoff <= repetition_y; yoff++) {
Expand Down Expand Up @@ -188,8 +187,7 @@ auto GSPHGhostHandler<vec>::find_interfaces(
}
}
} else if (BCShearingPeriodic *cfg = std::get_if<BCShearingPeriodic>(&ghost_config)) {
sycl::host_accessor acc_tf{
shambase::get_check_ref(int_range_max_tree.internal_buf), sycl::read_only};
auto acc_tf = int_range_max_tree.internal_buf->template mirror_to<sham::host>();

for_each_patch_shift<flt>(*cfg, bsize, [&](i32_3 ioff, ShiftInfo<flt> shift) {
i32 xoff = ioff.x();
Expand Down Expand Up @@ -242,8 +240,7 @@ auto GSPHGhostHandler<vec>::find_interfaces(
});

} else {
sycl::host_accessor acc_tf{
shambase::get_check_ref(int_range_max_tree.internal_buf), sycl::read_only};
auto acc_tf = int_range_max_tree.internal_buf->template mirror_to<sham::host>();
// sender translation
vec periodic_offset = vec{0, 0, 0};

Expand Down
2 changes: 1 addition & 1 deletion src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ namespace shammodels::sph {

PatchtreeField<flt> interactR_mpi_tree = sptree.make_patch_tree_field(
sched,
shamsys::instance::get_compute_queue(),
shamsys::instance::get_compute_scheduler_ptr(),
interactR_patch,
[](flt h0, flt h1, flt h2, flt h3, flt h4, flt h5, flt h6, flt h7) {
return sham::max_8points(h0, h1, h2, h3, h4, h5, h6, h7);
Expand Down
26 changes: 9 additions & 17 deletions src/shammodels/sph/src/BasicSPHGhosts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,7 @@ auto BasicSPHGhostHandler<vec>::find_interfaces(
base_timer.start();

if (BCPeriodic *cfg = std::get_if<BCPeriodic>(&ghost_config)) {
sycl::host_accessor acc_tf{
shambase::get_check_ref(int_range_max_tree.internal_buf), sycl::read_only};
auto acc_tf = int_range_max_tree.internal_buf->template mirror_to<sham::host>();

for (i32 xoff = -repetition_x; xoff <= repetition_x; xoff++) {
for (i32 yoff = -repetition_y; yoff <= repetition_y; yoff++) {
Expand All @@ -300,10 +299,8 @@ auto BasicSPHGhostHandler<vec>::find_interfaces(
// sender translation
vec periodic_offset = vec{xoff * bsize.x(), yoff * bsize.y(), zoff * bsize.z()};

sycl::host_accessor tree{
shambase::get_check_ref(sptree.serial_tree_buf), sycl::read_only};
sycl::host_accessor lpid{
shambase::get_check_ref(sptree.linked_patch_ids_buf), sycl::read_only};
auto tree = sptree.serial_tree_buf->template mirror_to<sham::host>();
auto lpid = sptree.linked_patch_ids_buf->template mirror_to<sham::host>();
Comment on lines +302 to +303

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Creating and destroying BufferMirror objects inside the triple loop over periodic offsets is highly inefficient. Each creation triggers a Device-to-Host copy, and each destruction triggers a Host-to-Device copy (write-back). These mirrors should be moved outside the loops (before line 296) to avoid redundant data transfers.


#pragma omp parallel for
for (u32 i = 0; i < sched.patch_list.local.size(); i++) {
Expand Down Expand Up @@ -359,8 +356,7 @@ auto BasicSPHGhostHandler<vec>::find_interfaces(
}
}
} else if (BCShearingPeriodic *cfg = std::get_if<BCShearingPeriodic>(&ghost_config)) {
sycl::host_accessor acc_tf{
shambase::get_check_ref(int_range_max_tree.internal_buf), sycl::read_only};
auto acc_tf = int_range_max_tree.internal_buf->template mirror_to<sham::host>();

for_each_patch_shift<flt>(*cfg, bsize, [&](i32_3 ioff, ShiftInfo<flt> shift) {
i32 xoff = ioff.x();
Expand All @@ -369,10 +365,8 @@ auto BasicSPHGhostHandler<vec>::find_interfaces(

vec offset = shift.shift;

sycl::host_accessor tree{
shambase::get_check_ref(sptree.serial_tree_buf), sycl::read_only};
sycl::host_accessor lpid{
shambase::get_check_ref(sptree.linked_patch_ids_buf), sycl::read_only};
auto tree = sptree.serial_tree_buf->template mirror_to<sham::host>();
auto lpid = sptree.linked_patch_ids_buf->template mirror_to<sham::host>();
Comment on lines +368 to +369

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Similar to the periodic case, creating mirrors inside the for_each_patch_shift lambda causes repeated data transfers for every shift. These mirrors should be moved outside the for_each_patch_shift call (before line 362).


#pragma omp parallel for
for (u32 i = 0; i < sched.patch_list.local.size(); i++) {
Expand Down Expand Up @@ -429,14 +423,12 @@ auto BasicSPHGhostHandler<vec>::find_interfaces(
});

} else {
sycl::host_accessor acc_tf{
shambase::get_check_ref(int_range_max_tree.internal_buf), sycl::read_only};
auto acc_tf = int_range_max_tree.internal_buf->template mirror_to<sham::host>();
// sender translation
vec periodic_offset = vec{0, 0, 0};

sycl::host_accessor tree{shambase::get_check_ref(sptree.serial_tree_buf), sycl::read_only};
sycl::host_accessor lpid{
shambase::get_check_ref(sptree.linked_patch_ids_buf), sycl::read_only};
auto tree = sptree.serial_tree_buf->template mirror_to<sham::host>();
auto lpid = sptree.linked_patch_ids_buf->template mirror_to<sham::host>();

#pragma omp parallel for
for (u32 i = 0; i < sched.patch_list.local.size(); i++) {
Expand Down
4 changes: 2 additions & 2 deletions src/shammodels/sph/src/modules/SPHSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -652,15 +652,15 @@ void shammodels::sph::modules::SPHSetup<Tvec, SPHKernel>::apply_setup_new(
shambase::throw_unimplemented();
}

sycl::buffer<u64> new_id_buf = sptree.compute_patch_owner(
sham::DeviceBuffer<u64> new_id_buf = sptree.compute_patch_owner(
shamsys::instance::get_compute_scheduler_ptr(),
pos_field.get_buf(),
pos_field.get_obj_cnt());

std::unordered_map<i32, std::vector<u32>> index_per_ranks;
bool err_id_in_newid = false;
{
sycl::host_accessor nid{new_id_buf, sycl::read_only};
std::vector<u64> nid = new_id_buf.copy_to_stdvec();
for (u32 i = 0; i < pos_field.get_obj_cnt(); i++) {
u64 patch_id = nid[i];
bool err = patch_id == u64_max;
Expand Down
9 changes: 7 additions & 2 deletions src/shamrock/include/shamrock/patch/PatchField.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@
*/

#include "shambase/DistributedData.hpp"
#include "shambackends/DeviceBuffer.hpp"
#include "shambackends/DeviceScheduler.hpp"
#include "shambackends/sycl.hpp"
#include <memory>
#include <optional>
namespace shamrock::patch {

template<class T>
Expand All @@ -33,10 +36,12 @@ namespace shamrock::patch {
template<class T>
class PatchtreeField {
public:
std::unique_ptr<sycl::buffer<T>> internal_buf;
std::optional<sham::DeviceBuffer<T>> internal_buf;

inline void reset() { internal_buf.reset(); }

inline void allocate(u32 size) { internal_buf = std::make_unique<sycl::buffer<T>>(size); }
inline void allocate(u32 size, sham::DeviceScheduler_ptr sched) {
internal_buf.emplace(size, sched);
}
};
} // namespace shamrock::patch
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,12 @@ namespace shamrock {
* bound).
*/
template<class T>
shambase::DistributedData<sycl::buffer<u64>> compute_new_pid(
shambase::DistributedData<sham::DeviceBuffer<u64>> compute_new_pid(
SerialPatchTree<T> &sptree, u32 ipos) {

StackEntry stack_loc{};

shambase::DistributedData<sycl::buffer<u64>> newid_buf_map;
shambase::DistributedData<sham::DeviceBuffer<u64>> newid_buf_map;

sched.patch_data.for_each_patchdata([&](u64 id, shamrock::patch::PatchDataLayer &pdat) {
if (!pdat.is_empty()) {
Expand All @@ -84,7 +84,7 @@ namespace shamrock {

bool err_id_in_newid = false;
{
sycl::host_accessor nid{newid_buf_map.get(id), sycl::read_only};
std::vector<u64> nid = newid_buf_map.get(id).copy_to_stdvec();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

copy_to_stdvec() performs a heap allocation and a full buffer copy from device to host. Calling it inside for_each_patchdata is inefficient and can become a performance bottleneck as the number of patches increases. Consider if this logic can be moved to a device kernel or if a more efficient host access method is available.

for (u32 i = 0; i < pdat.get_obj_cnt(); i++) {
bool err = nid[i] == u64_max;
err_id_in_newid = err_id_in_newid || (err);
Expand Down Expand Up @@ -114,7 +114,7 @@ namespace shamrock {
* @return A shared distributed data object containing the extracted patch data.
*/
inline shambase::DistributedDataShared<shamrock::patch::PatchDataLayer> extract_elements(
shambase::DistributedData<sycl::buffer<u64>> new_pid) {
shambase::DistributedData<sham::DeviceBuffer<u64>> &&new_pid) {
shambase::DistributedDataShared<patch::PatchDataLayer> part_exchange;

StackEntry stack_loc{};
Expand All @@ -128,7 +128,7 @@ namespace shamrock {
histogram_extract[current_pid] = 0;
if (!pdat.is_empty()) {

sycl::host_accessor nid{new_pid.get(current_pid), sycl::read_only};
std::vector<u64> nid = new_pid.get(current_pid).copy_to_stdvec();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Similar to the usage in compute_new_pid, calling copy_to_stdvec() inside a loop over patches is inefficient due to repeated allocations and data transfers.


if (false) {

Expand Down Expand Up @@ -222,9 +222,10 @@ namespace shamrock {

u32 ipos = sched.pdl_old().get_field_idx<T>(position_field);

DistributedData<sycl::buffer<u64>> new_pid = compute_new_pid(sptree, ipos);
DistributedData<sham::DeviceBuffer<u64>> new_pid = compute_new_pid(sptree, ipos);

DistributedDataShared<patch::PatchDataLayer> part_exchange = extract_elements(new_pid);
DistributedDataShared<patch::PatchDataLayer> part_exchange
= extract_elements(std::move(new_pid));

part_exchange.for_each([](u64 sender, u64 receiver, PatchDataLayer &pdat) {
shamlog_debug_ln("ReattributeDataUtility", sender, receiver, pdat.get_obj_cnt());
Expand Down
Loading