This repository was archived by the owner on Mar 20, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 42
Expand file tree
/
Copy pathcellorder.hpp
More file actions
122 lines (109 loc) · 4.27 KB
/
cellorder.hpp
File metadata and controls
122 lines (109 loc) · 4.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once
#include "coreneuron/utils/memory.h"
#include <algorithm>
namespace coreneuron {
/**
* \brief Function that performs the permutation of the cells such that the
* execution threads access coalesced memory.
*
* \param ith NrnThread to access
* \param ncell number of cells in NrnThread
* \param nnode number of compartments in the ncells
* \param parent parent indices of cells
*
* \return int* order, interleaved order of the cells
*/
int* interleave_order(int ith, int ncell, int nnode, int* parent);
void create_interleave_info();
void destroy_interleave_info();
/**
*
* \brief Solve the Hines matrices based on the interleave_permute_type (1 or 2).
*
* For interleave_permute_type == 1 : Naive interleaving -> Each execution thread deals with one
* Hines matrix (cell) For interleave_permute_type == 2 : Advanced interleaving -> Each Hines matrix
* is solved by multiple execution threads (with coalesced memory access as well)
*/
extern void solve_interleaved(int ith);
class InterleaveInfo; // forward declaration
/**
*
* \brief CUDA branch of the solve_interleaved with interleave_permute_type == 2.
*
* This branch is activated in runtime with the --cuda-interface CLI flag
*/
void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream);
class InterleaveInfo: public UnifiedMemManaged<> {
public:
InterleaveInfo() = default;
InterleaveInfo(const InterleaveInfo&);
InterleaveInfo& operator=(const InterleaveInfo&);
int nwarp = 0; // used only by interleave2
int nstride = 0;
int* stridedispl = nullptr; // interleave2: nwarp+1
int* stride = nullptr; // interleave2: stride length is stridedispl[nwarp]
int* firstnode = nullptr; // interleave2: rootbegin nwarp+1 displacements
int* lastnode = nullptr; // interleave2: nodebegin nwarp+1 displacements
int* cellsize = nullptr; // interleave2: ncycles nwarp
// statistics (nwarp of each)
size_t* nnode = nullptr;
size_t* ncycle = nullptr;
size_t* idle = nullptr;
size_t* cache_access = nullptr;
size_t* child_race = nullptr;
private:
void swap(InterleaveInfo& info);
};
/**
* \brief Function that returns a permutation of length nnode.
*
* There are two permutation strategies:
* For interleave_permute_type == 1 : Naive interleaving -> Each execution thread deals with one
* Hines matrix (cell) For interleave_permute_type == 2 : Advanced interleaving -> Each Hines matrix
* is solved by multiple execution threads (with coalesced memory access as well)
*
* \param ncell number of cells
* \param nnode number of compartments in the ncells
* \param parents parent indices of the cells
* \param nwarp number of warps
* \param nstride nstride is the maximum cell size (not counting root)
* \param stride stride[i] is the number of cells with an ith node:
* using stride[i] we know how many positions to move in order to
* access the next element of the same cell (given that the cells are
* ordered with the treenode_order).
* \param firstnode firstnode[i] is the index of the first nonroot node of the cell
* \param lastnode lastnode[i] is the index of the last node of the cell
* \param cellsize cellsize is the number of nodes in the cell not counting root.
* \param stridedispl
* \return int* : a permutation of length nnode
*/
int* node_order(int ncell,
int nnode,
int* parents,
int& nwarp,
int& nstride,
int*& stride,
int*& firstnode,
int*& lastnode,
int*& cellsize,
int*& stridedispl);
// copy src array to dest with NRN_SOA_BYTE_ALIGN ecalloc_align allocation
template <typename T>
void copy_array(T*& dest, T* src, size_t n) {
dest = static_cast<T*>(allocate_unified(n * sizeof(T)));
std::copy(src, src + n, dest);
}
#ifndef INTERLEAVE_DEBUG
#define INTERLEAVE_DEBUG 0
#endif
#if INTERLEAVE_DEBUG
void mk_cell_indices();
#endif
} // namespace coreneuron