diff --git a/NPB/CG/Makefile b/NPB/CG/Makefile new file mode 100644 index 0000000..688d3b8 --- /dev/null +++ b/NPB/CG/Makefile @@ -0,0 +1,20 @@ +SHELL=/bin/sh +BENCHMARK=cg +BENCHMARKU=CG + +include ../config/make.def + +OBJS = cg.o ${COMMON}/c_print_results.o \ + ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} + ${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS} + +cg.o: cg.cpp npbparams.hpp + ${CCOMPILE} cg.cpp + +clean: + - rm -f *.o *~ + - rm -f npbparams.hpp core diff --git a/NPB/CG/cg.cpp b/NPB/CG/cg.cpp new file mode 100644 index 0000000..ddeb6f3 --- /dev/null +++ b/NPB/CG/cg.cpp @@ -0,0 +1,1055 @@ +/*-------------------------------------------------------------------- + + Information on NAS Parallel Benchmarks is available at: + + http://www.nas.nasa.gov/Software/NPB/ + + Authors: M. Yarrow + C. Kuszmaul + + STL version: + Nicco Mietzsch + + CPP and OpenMP version: + Dalvan Griebler + Júnior Löff + +--------------------------------------------------------------------*/ + +/* +c--------------------------------------------------------------------- +c Note: please observe that in the routine conj_grad three +c implementations of the sparse matrix-vector multiply have +c been supplied. The default matrix-vector multiply is not +c loop unrolled. The alternate implementations are unrolled +c to a depth of 2 and unrolled to a depth of 8. Please +c experiment with these to find the fastest for your particular +c architecture. If reporting timing results, any of these three may +c be used without penalty. +c--------------------------------------------------------------------- +*/ +#include + +#include +#include "npbparams.hpp" +#include "npb-CPP.hpp" + +#define NZ NA*(NONZER+1)*(NONZER+1)+NA*(NONZER+2) + +#define TIMER_ENABLED FALSE + +/* global variables */ + +/* common /partit_size/ */ +static int naa; +static int nzz; +static int firstrow; +static int lastrow; +static int firstcol; +static int lastcol; + +/* common /urando/ */ +static double amult; +static double tran; + +/* function declarations */ +static void conj_grad (dash::Array> &colidx, dash::Array &rowstr, dash::Array &x, + dash::Array &z, dash::Array> &al, dash::Array &p, dash::Array &q, + dash::Array &r, dash::Array &w, double *rnorm); +static void makea(int n, int nz, double a[], int colidx[], int rowstr[], + int nonzer, int firstrow, int lastrow, int firstcol, + int lastcol, double rcond, int arow[], int acol[], + double aelt[], double v[], int iv[], double shift ); +static void sparse(double a[], int colidx[], int rowstr[], int n, + int arow[], int acol[], double aelt[], + int firstrow, int lastrow, + double x[], boolean mark[], int nzloc[], int nnza); +static void sprnvc(int n, int nz, double v[], int iv[], int nzloc[], int mark[]); +static int icnvrt(double x, int ipwr2); +static void vecset(int n, double v[], int iv[], int *nzv, int i, double val); + +/*-------------------------------------------------------------------- + program cg +--------------------------------------------------------------------*/ + +int main(int argc, char **argv) +{ + + dash::init(&argc, &argv); + + dash::Array a_global(NZ+1); + dash::Array rowstr_global(NA+1+1); + dash::Array colidx_global(NZ+1); + + dash::Array x(NA+1); /* x[1:NA] */ + dash::Array z(NA+1); /* z[1:NA] */ + dash::Array p(NA+1); /* p[1:NA] */ + dash::Array q(NA+1); /* q[1:NA] */ + dash::Array r(NA+1); /* r[1:NA] */ + dash::Array w(NA+1); /* w[1:NA] */ + + + int i, j, k, it; + int nthreads = dash::size(); + double zeta; + double rnorm; + double norm_temp11; + double norm_temp12; + double t, mflops; + char class_npb; + boolean verified; + double zeta_verify_value, epsilon; + + firstrow = 1; + lastrow = NA; + firstcol = 1; + lastcol = NA; + + if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) { + class_npb = 'S'; + zeta_verify_value = 8.5971775078648; + } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) { + class_npb = 'W'; + zeta_verify_value = 10.362595087124; + } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) { + class_npb = 'A'; + zeta_verify_value = 17.130235054029; + } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) { + class_npb = 'B'; + zeta_verify_value = 22.712745482631; + } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) { + class_npb = 'C'; + zeta_verify_value = 28.973605592845; + } else { + class_npb = 'U'; + } + + if ( 0 == dash::myid() ) { + + printf("\n\n NAS Parallel Benchmarks 4.0 C++ DASH version"" - CG Benchmark\n"); + printf("\n\n Developed by: Dalvan Griebler \n"); + printf("\n\n DASH version by: Nicco Mietzsch \n"); + printf(" Size: %10d\n", NA); + printf(" Iterations: %5d\n", NITER); + + + naa = NA; + nzz = NZ; + + /*-------------------------------------------------------------------- + c Initialize random number generator + c-------------------------------------------------------------------*/ + tran = 314159265.0; + amult = 1220703125.0; + zeta = randlc( &tran, amult ); + + /*-------------------------------------------------------------------- + c + c-------------------------------------------------------------------*/ + static int colidx_init[NZ+1]; /* colidx[1:NZ] */ + static int rowstr_init[NA+1+1]; /* rowstr[1:NA+1] */ + + static double a_init[NZ+1]; /* a[1:NZ] */ + + + + static int iv[2*NA+1+1]; /* iv[1:2*NA+1] */ + static int arow[NZ+1]; /* arow[1:NZ] */ + static int acol[NZ+1]; /* acol[1:NZ] */ + + static double v[NA+1+1]; /* v[1:NA+1] */ + static double aelt[NZ+1]; /* aelt[1:NZ] */ + + + makea(naa, nzz, a_init, colidx_init, rowstr_init, NONZER, + firstrow, lastrow, firstcol, lastcol, + RCOND, arow, acol, aelt, v, iv, SHIFT); + + for (i = 0; i < NZ+1; ++i) colidx_global[i] = colidx_init[i]; + for (i = 0; i < NA+1+1; ++i) rowstr_global[i] = rowstr_init[i]; + for (i = 0; i < NZ+1; ++i) a_global[i] = a_init[i]; + + + /*-------------------------------------------------------------------- + c set starting vector to (1, 1, .... 1) + c-------------------------------------------------------------------*/ + + for (i = 1; i <= NA; i++) { + x[i] = 1.0; + } + + } + dash::barrier(); + + //setup local arrays + // first, setup rowstr + + int elem_per_unit = ceil(((double) NA+1)/dash::size()); + + dash::Array rowstr((elem_per_unit+1)*dash::size()); + + for (int i = 0; i < elem_per_unit+1; i++) { + rowstr.local[i] = rowstr_global[min(dash::myid()*elem_per_unit + i, NA+1)]; + } + + + //now setup a and colidx + + using pattern_t = dash::CSRPattern<1>; + using index_t = int; + using extent_t = typename pattern_t::size_type; + + using array_double = dash::Array; + using array_int = dash::Array; + + std::vector local_sizes; + dash::Array my_elem_count(dash::size()); + + my_elem_count.local[0] = rowstr.local[elem_per_unit] - rowstr.local[0]; + + for (int unit_idx = 0; unit_idx < dash::size(); ++unit_idx) { + local_sizes.push_back(my_elem_count[unit_idx]); + } + + pattern_t pattern(local_sizes); + + array_double a(pattern); + array_int colidx(pattern); + + int offset = 0; + for (int k = 0; k < dash::myid(); ++k) offset += my_elem_count[k]; + + for (int i = 0; i < my_elem_count.local[0]; ++i) { + a.local[i] = a_global[offset + i]; + colidx.local[i] = colidx_global[offset + i]; + } + + for (int i = 0; i < elem_per_unit+1; ++i) { + rowstr.local[i] -= offset; + } + + dash::barrier(); + + zeta = 0.0; + + /*------------------------------------------------------------------- + c----> + c Do one iteration untimed to init all code and data page tables + c----> (then reinit, start timing, to niter its) + c-------------------------------------------------------------------*/ + + for (it = 1; it <= 1; it++) { + + /*-------------------------------------------------------------------- + c The call to the conjugate gradient routine: + c-------------------------------------------------------------------*/ + conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm); + + /*-------------------------------------------------------------------- + c zeta = shift + 1/(x.z) + c So, first: (x.z) + c Also, find norm of z + c So, first: (z.z) + c-------------------------------------------------------------------*/ + dash::Array norm_temp11(dash::size()); + + auto it2 = z.lbegin(); + for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) { + norm_temp11.local[0] += *it1 * *it2; + it2++; + } + + norm_temp11.local[0] = dash::reduce(norm_temp11.begin(), norm_temp11.end(), 0.0, std::plus()); + + dash::Array norm_temp12(dash::size()); + + for (auto it1 = z.lbegin(); it1 != z.lend(); ++it1) { + norm_temp12.local[0] += *it1 * *it1; + } + + norm_temp12.local[0] = dash::reduce(norm_temp12.begin(), norm_temp12.end(), 0.0, std::plus()); + + norm_temp12.local[0] = 1.0 / sqrt( norm_temp12.local[0] ); + + /*-------------------------------------------------------------------- + c Normalize z to obtain x + c-------------------------------------------------------------------*/ + + it2 = z.lbegin(); + for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) { + *it1 = *it2 * norm_temp12.local[0]; + it2++; + } + + } /* end of do one iteration untimed */ + + /*-------------------------------------------------------------------- + c set starting vector to (1, 1, .... 1) + c-------------------------------------------------------------------*/ + if (0 == dash::myid()) { + + for (i = 1; i <= NA; i++) { + x[i] = 1.0; + } + + } + + zeta = 0.0; + + dash::barrier(); + + if (0 == dash::myid()) { + + timer_clear( 1 ); + timer_clear( 2 ); + + timer_start( 1 ); + + } + + /*-------------------------------------------------------------------- + c----> + c Main Iteration for inverse power method + c----> + c-------------------------------------------------------------------*/ + + + for (it = 1; it <= NITER; it++) { + + /*-------------------------------------------------------------------- + c The call to the conjugate gradient routine: + c-------------------------------------------------------------------*/ + conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm); + /*-------------------------------------------------------------------- + c zeta = shift + 1/(x.z) + c So, first: (x.z) + c Also, find norm of z + c So, first: (z.z) + c-------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + dash::Array norm_temp11(dash::size()); + + auto it2 = z.lbegin(); + for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) { + norm_temp11.local[0] += *it1 * *it2; + it2++; + } + + norm_temp11.local[0] = dash::reduce(norm_temp11.begin(), norm_temp11.end(), 0.0, std::plus()); + + dash::Array norm_temp12(dash::size()); + + for (auto it1 = z.lbegin(); it1 != z.lend(); ++it1) { + norm_temp12.local[0] += *it1 * *it1; + } + + norm_temp12.local[0] = dash::reduce(norm_temp12.begin(), norm_temp12.end(), 0.0, std::plus()); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + + norm_temp12.local[0] = 1.0 / sqrt( norm_temp12.local[0] ); + + if ( 0 == dash::myid() ) { + + zeta = SHIFT + 1.0 / norm_temp11.local[0]; + + if ( it == 1 ) { + printf(" iteration ||r|| zeta\n"); + } + + printf(" %5d %20.14e%20.13e\n", it, rnorm, zeta); + + } + + /*-------------------------------------------------------------------- + c Normalize z to obtain x + c-------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + it2 = z.lbegin(); + for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) { + *it1 = *it2 * norm_temp12.local[0]; + it2++; + } + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + + /* end of main iter inv pow meth */ + + } + + dash::barrier(); + + if (0 == dash::myid()) { + + + timer_stop( 1 ); + /*-------------------------------------------------------------------- + c End of timed section + c-------------------------------------------------------------------*/ + + t = timer_read( 1 ); + + printf(" Benchmark completed\n"); + + epsilon = 1.0e-10; + if (class_npb != 'U') { + if (fabs(zeta - zeta_verify_value) <= epsilon) { + verified = TRUE; + printf(" VERIFICATION SUCCESSFUL\n"); + printf(" Zeta is %20.12e\n", zeta); + printf(" Error is %20.12e\n", zeta - zeta_verify_value); + } else { + verified = FALSE; + printf(" VERIFICATION FAILED\n"); + printf(" Zeta %20.12e\n", zeta); + printf(" The correct zeta is %20.12e\n", zeta_verify_value); + } + } else { + verified = FALSE; + printf(" Problem size unknown\n"); + printf(" NO VERIFICATION PERFORMED\n"); + } + if ( t != 0.0 ) { + mflops = (2.0*NITER*NA) + * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 ) + / t / 1000000.0; + } else { + mflops = 0.0; + } + c_print_results((char*)"CG", class_npb, NA, 0, 0, NITER, nthreads, t, mflops, (char*)" floating point", verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7); + + } + + return 0; +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ +static void conj_grad ( + dash::Array> &colidx, /* colidx[1:nzz] */ + dash::Array &rowstr, /* rowstr[1:naa+1] */ + dash::Array &x, /* x[*] */ + dash::Array &z, /* z[*] */ + dash::Array> &a, /* a[1:nzz] */ + dash::Array &p, /* p[*] */ + dash::Array &q, /* q[*] */ + dash::Array &r, /* r[*] */ + dash::Array &w, /* w[*] */ + double *rnorm ) +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------- +c Floaging point arrays here are named as in NPB1 spec discussion of +c CG algorithm +c---------------------------------------------------------------------*/ +{ + //static double d, sum, rho, rho0, alpha, beta; + //double d, sum, rho, rho0, alpha, beta; + dash::Array d(dash::size()); + dash::Array sum(dash::size()); + dash::Array rho(dash::size()); + dash::Array rho0(dash::size()); + dash::Array alpha(dash::size()); + dash::Array beta(dash::size()); + int j; + int cgit, cgitmax = 25; + + + rho.local[0] = 0.0; + + int elem_per_unit = ceil(((double) NA+1)/dash::size()); + + /*-------------------------------------------------------------------- + c Initialize the CG algorithm: + c-------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + dash::fill(q.begin(), q.end(), 0.0); + dash::fill(z.begin(), z.end(), 0.0); + dash::fill(w.begin(), w.end(), 0.0); + + std::copy(x.lbegin(), x.lend(), r.lbegin()); + std::copy(r.lbegin(), r.lend(), p.lbegin()); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + + /*-------------------------------------------------------------------- + c rho = r.r + c Now, obtain the norm of r: First, sum squares of r elements locally... + c-------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + for (auto it1 = r.lbegin(); it1 != r.lend(); it1++) { + rho.local[0] = rho.local[0] + *it1 * *it1; + } + + rho.local[0] = dash::reduce(rho.begin(), rho.end(), 0.0, std::plus()); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + + /*-------------------------------------------------------------------- + c----> + c The conj grad iteration loop + c----> + c-------------------------------------------------------------------*/ + + for (cgit = 1; cgit <= cgitmax; cgit++) { + + rho0.local[0] = rho.local[0]; + d.local[0] = 0.0; + rho.local[0] = 0.0; + + + /*-------------------------------------------------------------------- + c q = A.p + c The partition submatrix-vector multiply: use workspace w + c--------------------------------------------------------------------- + C + C NOTE: this version of the multiply is actually (slightly: maybe %5) + C faster on the sp2 on 16 nodes than is the unrolled-by-2 version + C below. On the Cray t3d, the reverse is true, i.e., the + C unrolled-by-two version is some 10% faster. + C The unrolled-by-8 version below is significantly faster + C on the Cray t3d - overall speed of code is 1.5 times faster. + */ + + double p_local[p.size()]; + dash::copy(p.begin(), p.end(), &p_local[0]); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + // rolled version + + for (int j = 0; j < elem_per_unit; j++) { + double sum = 0.0; + for (int k = rowstr.local[j]; k < rowstr.local[j+1]; k++) { + sum = sum + a.local[k]*p_local[colidx.local[k]]; + } + w.local[j] = sum; + } + + + //unrolled-by-two version + /* + for (int j = 0; j < elem_per_unit; j++) { + int iresidue; + double sum1, sum2; + int i = rowstr.local[j]; + iresidue = (rowstr.local[j+1]-i) % 2; + sum1 = 0.0; + sum2 = 0.0; + if (iresidue == 1) sum1 = sum1 + a.local[i]*p_local[colidx.local[i]]; + for (int k = i+iresidue; k <= rowstr.local[j+1]-2; k += 2) { + sum1 = sum1 + a.local[k] * p_local[colidx.local[k]]; + sum2 = sum2 + a.local[k+1] * p_local[colidx.local[k+1]]; + } + w.local[j] = sum1 + sum2; + }*/ + + + //unrolled-by-8 version + /* + for (int j = 0; j < elem_per_unit; j++) { + int iresidue, k; + int i = rowstr.local[j]; + iresidue = (rowstr.local[j+1]-i) % 8; + double sum = 0.0; + for (k = i; k <= i+iresidue-1; k++) { + sum = sum + a.local[k] * p_local[colidx.local[k]]; + } + for (k = i+iresidue; k <= rowstr.local[j+1]-8; k += 8) { + sum = sum + + a.local[k ] * p_local[colidx.local[k ]] + + a.local[k+1] * p_local[colidx.local[k+1]] + + a.local[k+2] * p_local[colidx.local[k+2]] + + a.local[k+3] * p_local[colidx.local[k+3]] + + a.local[k+4] * p_local[colidx.local[k+4]] + + a.local[k+5] * p_local[colidx.local[k+5]] + + a.local[k+6] * p_local[colidx.local[k+6]] + + a.local[k+7] * p_local[colidx.local[k+7]]; + } + w.local[j] = sum; + }*/ + + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + std::copy(w.lbegin(), w.lend(), q.lbegin()); + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + /*-------------------------------------------------------------------- + c Clear w for reuse... Do we really need that? Commented it out for the moment. + c-------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + //dash::fill(w.begin(), w.end(), 0.0); + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + /*-------------------------------------------------------------------- + c Obtain p.q + c-------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + auto it2 = p.lbegin(); + for (auto it1 = q.lbegin(); it1 != q.lend(); ++it1) { + d.local[0] += *it1 * *it2; + it2++; + } + + d.local[0] = dash::reduce(d.begin(), d.end(), 0.0, std::plus()); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + /*-------------------------------------------------------------------- + c Obtain alpha = rho / (p.q) + c-------------------------------------------------------------------*/ + + alpha.local[0] = rho0.local[0] / d.local[0]; + + /*-------------------------------------------------------------------- + c Save a temporary of rho + c-------------------------------------------------------------------*/ + /* rho0 = rho;*/ + + /*--------------------------------------------------------------------- + c Obtain z = z + alpha*p + c and r = r - alpha*q + c---------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + it2 = p.lbegin(); + for (auto it1 = z.lbegin(); it1 != z.lend(); ++it1) { + *it1 = *it1 + alpha.local[0]* *it2; + it2++; + } + + it2 = q.lbegin(); + for (auto it1 = r.lbegin(); it1 != r.lend(); ++it1) { + *it1 = *it1 - alpha.local[0]* *it2; + it2++; + } + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + /*--------------------------------------------------------------------- + c rho = r.r + c Now, obtain the norm of r: First, sum squares of r elements locally... + c---------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + for (auto it1 = r.lbegin(); it1 != r.lend(); ++it1) { + rho.local[0] += *it1 * *it1; + } + + rho.local[0] = dash::reduce(rho.begin(), rho.end(), 0.0, std::plus()); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + /*-------------------------------------------------------------------- + c Obtain beta: + c-------------------------------------------------------------------*/ + + beta.local[0] = rho.local[0] / rho0.local[0]; + + /*-------------------------------------------------------------------- + c p = r + beta*p + c-------------------------------------------------------------------*/ + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + it2 = r.lbegin(); + for (auto it1 = p.lbegin(); it1 != p.lend(); ++it1) { + *it1 = *it2 + beta.local[0]* *it1; + it2++; + } + dash::barrier(); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + } /* end of do cgit=1,cgitmax */ + + /*--------------------------------------------------------------------- + c Compute residual norm explicitly: ||r|| = ||x - A.z|| + c First, form A.z + c The partition submatrix-vector multiply + c---------------------------------------------------------------------*/ + + sum.local[0] = 0.0; + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + double z_local[p.size()]; + dash::copy(z.begin(), z.end(), &z_local[0]); + + for (int j = 0; j < elem_per_unit; j++) { + double sum = 0.0; + for (int k = rowstr.local[j]; k < rowstr.local[j+1]; k++) { + sum = sum + a.local[k]*z_local[colidx.local[k]]; + } + w.local[j] = sum; + } + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + std::copy(w.lbegin(), w.lend(), r.lbegin()); + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + /*-------------------------------------------------------------------- + c At this point, r contains A.z + c-------------------------------------------------------------------*/ + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2); + + auto it2 = r.lbegin(); + for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) { + sum.local[0] += (*it1 - *it2)*(*it1 - *it2); + it2++; + } + + sum.local[0] = dash::reduce(sum.begin(), sum.end(), 0.0, std::plus()); + + if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2); + + (*rnorm) = sqrt(sum.local[0]); +} + +/*--------------------------------------------------------------------- +c generate the test problem for benchmark 6 +c makea generates a sparse matrix with a +c prescribed sparsity distribution +c +c parameter type usage +c +c input +c +c n i number of cols/rows of matrix +c nz i nonzeros as declared array size +c rcond r*8 condition number +c shift r*8 main diagonal shift +c +c output +c +c a r*8 array for nonzeros +c colidx i col indices +c rowstr i row pointers +c +c workspace +c +c iv, arow, acol i +c v, aelt r*8 +c---------------------------------------------------------------------*/ +static void makea( + int n, + int nz, + double a[], /* a[1:nz] */ + int colidx[], /* colidx[1:nz] */ + int rowstr[], /* rowstr[1:n+1] */ + int nonzer, + int firstrow, + int lastrow, + int firstcol, + int lastcol, + double rcond, + int arow[], /* arow[1:nz] */ + int acol[], /* acol[1:nz] */ + double aelt[], /* aelt[1:nz] */ + double v[], /* v[1:n+1] */ + int iv[], /* iv[1:2*n+1] */ + double shift ) +{ + int i, nnza, iouter, ivelt, ivelt1, irow, nzv; + + /*-------------------------------------------------------------------- + c nonzer is approximately (int(sqrt(nnza /n))); + c-------------------------------------------------------------------*/ + + double size, ratio, scale; + int jcol; + + size = 1.0; + ratio = pow(rcond, (1.0 / (double)n)); + nnza = 0; + + /*--------------------------------------------------------------------- + c Initialize colidx(n+1 .. 2n) to zero. + c Used by sprnvc to mark nonzero positions + c---------------------------------------------------------------------*/ + + for (i = 1; i <= n; i++) { + colidx[n+i] = 0; + } + for (iouter = 1; iouter <= n; iouter++) { + nzv = nonzer; + sprnvc(n, nzv, v, iv, &(colidx[0]), &(colidx[n])); + vecset(n, v, iv, &nzv, iouter, 0.5); + for (ivelt = 1; ivelt <= nzv; ivelt++){ + jcol = iv[ivelt]; + if (jcol >= firstcol && jcol <= lastcol) { + scale = size * v[ivelt]; + for (ivelt1 = 1; ivelt1 <= nzv; ivelt1++) { + irow = iv[ivelt1]; + if (irow >= firstrow && irow <= lastrow) { + nnza = nnza + 1; + if (nnza > nz) { + printf("Space for matrix elements exceeded in" " makea\n"); + printf("nnza, nzmax = %d, %d\n", nnza, nz); + printf("iouter = %d\n", iouter); + exit(1); + } + acol[nnza] = jcol; + arow[nnza] = irow; + aelt[nnza] = v[ivelt1] * scale; + } + } + } + } + size = size * ratio; + } + + /*--------------------------------------------------------------------- + c ... add the identity * rcond to the generated matrix to bound + c the smallest eigenvalue from below by rcond + c---------------------------------------------------------------------*/ + for (i = firstrow; i <= lastrow; i++) { + if (i >= firstcol && i <= lastcol) { + iouter = n + i; + nnza = nnza + 1; + if (nnza > nz) { + printf("Space for matrix elements exceeded in makea\n"); + printf("nnza, nzmax = %d, %d\n", nnza, nz); + printf("iouter = %d\n", iouter); + exit(1); + } + acol[nnza] = i; + arow[nnza] = i; + aelt[nnza] = rcond - shift; + } + } + + /*--------------------------------------------------------------------- + c ... make the sparse matrix from list of elements with duplicates + c (v and iv are used as workspace) + c---------------------------------------------------------------------*/ + sparse(a, colidx, rowstr, n, arow, acol, aelt, firstrow, lastrow, v, &(iv[0]), &(iv[n]), nnza); +} + +/*--------------------------------------------------- +c generate a sparse matrix from a list of +c [col, row, element] tri +c---------------------------------------------------*/ +static void sparse( + double a[], /* a[1:*] */ + int colidx[], /* colidx[1:*] */ + int rowstr[], /* rowstr[1:*] */ + int n, + int arow[], /* arow[1:*] */ + int acol[], /* acol[1:*] */ + double aelt[], /* aelt[1:*] */ + int firstrow, + int lastrow, + double x[], /* x[1:n] */ + boolean mark[], /* mark[1:n] */ + int nzloc[], /* nzloc[1:n] */ + int nnza) +/*--------------------------------------------------------------------- +c rows range from firstrow to lastrow +c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values +c---------------------------------------------------------------------*/ +{ + int nrows; + int i, j, jajp1, nza, k, nzrow; + double xi; + + /*-------------------------------------------------------------------- + c how many rows of result + c-------------------------------------------------------------------*/ + nrows = lastrow - firstrow + 1; + + /*-------------------------------------------------------------------- + c ...count the number of triples in each row + c-------------------------------------------------------------------*/ + + for (j = 1; j <= n; j++) { + rowstr[j] = 0; + mark[j] = FALSE; + } + rowstr[n+1] = 0; + + for (nza = 1; nza <= nnza; nza++) { + j = (arow[nza] - firstrow + 1) + 1; + rowstr[j] = rowstr[j] + 1; + } + rowstr[1] = 1; + for (j = 2; j <= nrows+1; j++) { + rowstr[j] = rowstr[j] + rowstr[j-1]; + } + + /*--------------------------------------------------------------------- + c ... rowstr(j) now is the location of the first nonzero + c of row j of a + c---------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c ... do a bucket sort of the triples on the row index + c-------------------------------------------------------------------*/ + for (nza = 1; nza <= nnza; nza++) { + j = arow[nza] - firstrow + 1; + k = rowstr[j]; + a[k] = aelt[nza]; + colidx[k] = acol[nza]; + rowstr[j] = rowstr[j] + 1; + } + + /*-------------------------------------------------------------------- + c ... rowstr(j) now points to the first element of row j+1 + c-------------------------------------------------------------------*/ + for (j = nrows; j >= 1; j--) { + rowstr[j+1] = rowstr[j]; + } + rowstr[1] = 1; + + /*-------------------------------------------------------------------- + c ... generate the actual output rows by adding elements + c-------------------------------------------------------------------*/ + nza = 0; + + for (i = 1; i <= n; i++) { + x[i] = 0.0; + mark[i] = FALSE; + } + + jajp1 = rowstr[1]; + for (j = 1; j <= nrows; j++) { + nzrow = 0; + + /*-------------------------------------------------------------------- + c ...loop over the jth row of a + c-------------------------------------------------------------------*/ + for (k = jajp1; k < rowstr[j+1]; k++) { + i = colidx[k]; + x[i] = x[i] + a[k]; + if ( mark[i] == FALSE && x[i] != 0.0) { + mark[i] = TRUE; + nzrow = nzrow + 1; + nzloc[nzrow] = i; + } + } + + /*-------------------------------------------------------------------- + c ... extract the nonzeros of this row + c-------------------------------------------------------------------*/ + for (k = 1; k <= nzrow; k++) { + i = nzloc[k]; + mark[i] = FALSE; + xi = x[i]; + x[i] = 0.0; + if (xi != 0.0) { + nza = nza + 1; + a[nza] = xi; + colidx[nza] = i; + } + } + jajp1 = rowstr[j+1]; + rowstr[j+1] = nza + rowstr[1]; + } +} + +/*--------------------------------------------------------------------- +c generate a sparse n-vector (v, iv) +c having nzv nonzeros +c +c mark(i) is set to 1 if position i is nonzero. +c mark is all zero on entry and is reset to all zero before exit +c this corrects a performance bug found by John G. Lewis, caused by +c reinitialization of mark on every one of the n calls to sprnvc +---------------------------------------------------------------------*/ +static void sprnvc( + int n, + int nz, + double v[], /* v[1:*] */ + int iv[], /* iv[1:*] */ + int nzloc[], /* nzloc[1:n] */ + int mark[] ) /* mark[1:n] */ +{ + int nn1; + int nzrow, nzv, ii, i; + double vecelt, vecloc; + + nzv = 0; + nzrow = 0; + nn1 = 1; + do { + nn1 = 2 * nn1; + } while (nn1 < n); + + /*-------------------------------------------------------------------- + c nn1 is the smallest power of two not less than n + c-------------------------------------------------------------------*/ + + while (nzv < nz) { + vecelt = randlc(&tran, amult); + + /*-------------------------------------------------------------------- + c generate an integer between 1 and n in a portable manner + c-------------------------------------------------------------------*/ + vecloc = randlc(&tran, amult); + i = icnvrt(vecloc, nn1) + 1; + if (i > n) continue; + + /*-------------------------------------------------------------------- + c was this integer generated already? + c-------------------------------------------------------------------*/ + if (mark[i] == 0) { + mark[i] = 1; + nzrow = nzrow + 1; + nzloc[nzrow] = i; + nzv = nzv + 1; + v[nzv] = vecelt; + iv[nzv] = i; + } + } + + for (ii = 1; ii <= nzrow; ii++) { + i = nzloc[ii]; + mark[i] = 0; + } +} + +/*--------------------------------------------------------------------- +* scale a double precision number x in (0,1) by a power of 2 and chop it +*---------------------------------------------------------------------*/ +static int icnvrt(double x, int ipwr2) { + return ((int)(ipwr2 * x)); +} + +/*-------------------------------------------------------------------- +c set ith element of sparse vector (v, iv) with +c nzv nonzeros to val +c-------------------------------------------------------------------*/ +static void vecset( + int n, + double v[], /* v[1:*] */ + int iv[], /* iv[1:*] */ + int *nzv, + int i, + double val) +{ + int k; + boolean set; + + set = FALSE; + for (k = 1; k <= *nzv; k++) { + if (iv[k] == i) { + v[k] = val; + set = TRUE; + } + } + if (set == FALSE) { + *nzv = *nzv + 1; + v[*nzv] = val; + iv[*nzv] = i; + } +} diff --git a/NPB/EP/Makefile b/NPB/EP/Makefile new file mode 100644 index 0000000..c84337e --- /dev/null +++ b/NPB/EP/Makefile @@ -0,0 +1,21 @@ +SHELL=/bin/sh +BENCHMARK=ep +BENCHMARKU=EP + +include ../config/make.def + +OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \ + ${COMMON}/c_timers.o ${COMMON}/c_wtime.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} + ${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS} + + +ep.o: ep.cpp npbparams.hpp + ${CCOMPILE} ep.cpp + +clean: + - rm -f *.o *~ + - rm -f npbparams.hpp core diff --git a/NPB/EP/ep.cpp b/NPB/EP/ep.cpp new file mode 100644 index 0000000..a1ba2a2 --- /dev/null +++ b/NPB/EP/ep.cpp @@ -0,0 +1,289 @@ +/*-------------------------------------------------------------------- + + Information on NAS Parallel Benchmarks is available at: + + http://www.nas.nasa.gov/Software/NPB/ + + Authors: P. O. Frederickson + D. H. Bailey + A. C. Woo + + STL version: + Nicco Mietzsch + + CPP and OpenMP version: + Dalvan Griebler + Júnior Löff + +--------------------------------------------------------------------*/ +#include + +#include "npbparams.hpp" +#include +#include <../common/npb-CPP.hpp> + + +/* parameters */ +#define MK 16 +#define MM (M - MK) +#define NN (1 << MM) +#define NK (1 << MK) +#define NQ 10 +#define EPSILON 1.0e-8 +#define A 1220703125.0 +#define S 271828183.0 +#define TIMERS_ENABLED FALSE + +struct ep_res { + double sx; + double sy; + double q[NQ]; +}; + +/*-------------------------------------------------------------------- + program EMBAR +c-------------------------------------------------------------------*/ +/* +c This is the serial version of the APP Benchmark 1, +c the "embarassingly parallel" benchmark. +c +c M is the Log_2 of the number of complex pairs of uniform (0, 1) random +c numbers. MK is the Log_2 of the size of each batch of uniform random +c numbers. MK can be set for convenience on a given system, since it does +c not affect the results. +*/ +int main(int argc, char **argv) { + + dash::init(&argc, &argv); + + double Mops, t1, sx, sy, tm, an, gc; + double dum[3] = { 1.0, 1.0, 1.0 }; + int np, i, nit, k_offset, j; + boolean verified; + + int nthreads = dash::size(); + + if ( 0 == dash::myid() ) { + + char size[13+1]; /* character*13 */ + /* + c Because the size of the problem is too large to store in a 32-bit + c integer for some classes, we put it into a string (for printing). + c Have to strip off the decimal point put in there by the floating + c point print statement (internal file) + */ + + printf("\n\n NAS Parallel Benchmarks 4.0 C++ DASH version"" - EP Benchmark\n"); + printf("\n\n Developed by: Dalvan Griebler \n"); + printf("\n\n DASH version by: Nicco Mietzsch \n"); + sprintf(size, "%12.0f", pow(2.0, M+1)); + for (j = 13; j >= 1; j--) { + if ( size[j] == '.' ) size[j] = ' '; + } + printf(" Number of random numbers generated: %13s\n", size); + + verified = FALSE; + } + /* + c Compute the number of "batches" of random number pairs generated + c per processor. Adjust if the number of processors does not evenly + c divide the total number + */ + np = NN; + + /* + c Call the random number generator functions and initialize + c the x-array to reduce the effects of paging on the timings. + c Also, call all mathematical functions that are used. Make + c sure these initializations cannot be eliminated as dead code. + */ + vranlc(0, &(dum[0]), dum[1], &(dum[2])); + dum[0] = randlc(&(dum[1]), dum[2]); + + double my_x[2*NK+1]; + for (i = 0; i < 2*NK+1; i++) my_x[i] = -1.0e99; + Mops = log(sqrt(fabs(max(1.0, 1.0)))); + + if ( 0 == dash::myid() ) { + + timer_clear(1); + timer_clear(2); + timer_clear(3); + timer_clear(4); + + timer_start(1); + } + + vranlc(0, &t1, A, my_x); + + /* Compute AN = A ^ (2 * NK) (mod 2^46). */ + + t1 = A; + + for (i = 1; i <= MK+1; i++) { + an = randlc(&t1, t1); + } + + dash::Array res(nthreads); + + an = t1; + gc = 0.0; + res.local[0].sx = 0.0; + res.local[0].sy = 0.0; + + for ( i = 0; i <= NQ - 1; i++) { + res.local[0].q[i] = 0.0; + } + + /* + c Each instance of this loop may be performed independently. We compute + c the k offsets separately to take into account the fact that some nodes + c have more numbers to generate than others + */ + if ( TIMERS_ENABLED == TRUE ) timer_start(4); + + dash::Array v(np); + + //if( 0 == dash::myid()) std::iota(v.begin(), v.end(), 0); + + std::iota(v.lbegin(), v.lend(), dash::myid() * ceil((double) np / dash::size())); //only works when using blocking pattern + + v.barrier(); + + dash::for_each(v.begin(), v.end(), [&an, &my_x, &res](int k) { + + double t1, t2, t3, t4, x1, x2; + int kk, i, ik, l; + + kk = k; + t1 = S; + t2 = an; + + // Find starting seed t1 for this kk. + + for (i = 1; i <= 100; i++) { + ik = kk / 2; + if (2 * ik != kk) t3 = randlc(&t1, t2); + if (ik == 0) break; + t3 = randlc(&t2, t2); + kk = ik; + } + + // Compute uniform pseudorandom numbers. + + //if (TIMERS_ENABLED == TRUE) timer_start(3); + vranlc(2*NK, &t1, A, my_x); + //if (TIMERS_ENABLED == TRUE) timer_stop(3); + + // + //c Compute Gaussian deviates by acceptance-rejection method and + //c tally counts in concentric square annuli. This loop is not + //c vectorizable. + // + //if (TIMERS_ENABLED == TRUE) timer_start(2); + + for (i = 1; i <= NK; i++) { + + x1 = 2.0 * my_x[2*i-1] - 1.0; + x2 = 2.0 * my_x[2*i] - 1.0; + t1 = pow2(x1) + pow2(x2); + if ( t1 <= 1.0 ) { + t2 = sqrt(-2.0 * log(t1) / t1); + t3 = (x1 * t2); // Xi + t4 = (x2 * t2); // Yi + l = max(fabs(t3), fabs(t4)); + res.local[0].q[l] += 1.0; // counts + res.local[0].sx = res.local[0].sx + t3; // sum of Xi + res.local[0].sy = res.local[0].sy + t4; // sum of Yi + } + } + //if (TIMERS_ENABLED == TRUE) timer_stop(2); + }); + + v.barrier(); + + if ( TIMERS_ENABLED == TRUE ) timer_stop(4); + + ep_res zero; + + res[0] = dash::reduce(res.begin(), res.end(), zero, [](ep_res a, ep_res b){ + ep_res c; + c.sx = a.sx + b.sx; + c.sy = a.sy + b.sy; + for (int i = 0; i < NQ; ++i) c.q[i] = a.q[i] + b.q[i]; + return c; + }); + + res.barrier(); + + if ( 0 == dash::myid() ) { + + for (i = 0; i <= NQ-1; i++) { + gc = gc + res.local[0].q[i]; + } + + sx = res.local[0].sx; + sy = res.local[0].sy; + + timer_stop(1); + tm = timer_read(1); + + + + nit = 0; + if ( M == 24 ) { + if ( (fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON) ) { + verified = TRUE; + } + } else if ( M == 25 ) { + if ( (fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON) ) { + verified = TRUE; + } + } else if ( M == 28 ) { + //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { + if ( (fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON) ) { + verified = TRUE; + } + } else if ( M == 30 ) { + if ( (fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON) ) { + verified = TRUE; + } + } else if ( M == 32 ) { + if ( (fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON) ) { + verified = TRUE; + } + } else if ( M == 36 ) { + if ( (fabs((sx- (1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON) ) { + verified = TRUE; + } + } else if ( M == 40 ) { + if ( (fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON) ) { + verified = TRUE; + } + } + + Mops = pow(2.0, M+1)/tm/1000000.0; + + printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" + "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); + for (i = 0; i <= NQ-1; i++) { + printf("%3d %15.0f\n", i, res.local[0].q[i]); + } + + c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, (char*)"Random numbers generated", + verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7); + + if ( TIMERS_ENABLED == TRUE ) { + printf("Total time: %f\n", timer_read(1)); + printf("Gaussian pairs: %f\n", timer_read(2)); + printf("Random numbers: %f\n", timer_read(3)); + } + + } + + + + dash::finalize(); + + return 0; +} diff --git a/NPB/FT/Makefile b/NPB/FT/Makefile new file mode 100644 index 0000000..7e1e4a8 --- /dev/null +++ b/NPB/FT/Makefile @@ -0,0 +1,20 @@ +SHELL=/bin/sh +BENCHMARK=ft +BENCHMARKU=FT + +include ../config/make.def + +OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \ + ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} + ${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS} + +ft.o: ft.cpp global.hpp npbparams.hpp + ${CCOMPILE} ft.cpp + +clean: + - rm -f *.o *~ mputil* + - rm -f ft npbparams.hpp core diff --git a/NPB/FT/ft.cpp b/NPB/FT/ft.cpp new file mode 100644 index 0000000..0574755 --- /dev/null +++ b/NPB/FT/ft.cpp @@ -0,0 +1,1160 @@ +/*-------------------------------------------------------------------- + + Information on NAS Parallel Benchmarks is available at: + + http://www.nas.nasa.gov/Software/NPB/ + + Authors: D. Bailey + W. Saphir + + STL version: + Nicco Mietzsch + + CPP and OpenMP version: + Dalvan Griebler + Júnior Löff + +--------------------------------------------------------------------*/ + +#include + +#include +#include "npb-CPP.hpp" + +/* global variables */ +#include "global.hpp" + +/* function declarations */ +static void evolve(dash::NArray &u0, dash::NArray &u1, int t, dash::NArray &indexmap); +static void compute_initial_conditions(dash::NArray &u0); +static void ipow46(double a, int exponent, double *result); +static void setup(void); +static void compute_indexmap(dash::NArray &indexmap); +static void print_timers(void); +static void fft(int dir, dash::NArray &x1, dash::NArray &x2); +static void cffts1(int is, int d[3], dash::NArray &x, dash::NArray &xout); +static void cffts2(int is, int d[3], dash::NArray &x, dash::NArray &xout); +static void cffts3(int is, int d[3], dash::NArray &x, dash::NArray &xout); +static void fft_init (int n); +static void cfftz (int is, int m, int n, dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD]); +static void fftz2 (int is, int l, int m, int n, int ny, int ny1, dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD]); +static int ilog2(int n); +static void checksum(int i, dash::NArray &u1); +static void verify (int d1, int d2, int d3, int nt, boolean *verified, char *class_npb); + +/*-------------------------------------------------------------------- +c FT benchmark +c-------------------------------------------------------------------*/ + +int main(int argc, char **argv) { + + + dash::init(&argc, &argv); + /*c------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + int i; + + /*------------------------------------------------------------------ + c u0, u1, u2 are the main arrays in the problem. + c Depending on the decomposition, these arrays will have different + c dimensions. To accomodate all possibilities, we allocate them as + c one-dimensional arrays and pass them to subroutines for different + c views + c - u0 contains the initial (transformed) initial condition + c - u1 and u2 are working arrays + c - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the + c time evolution operator. + c-----------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c Large arrays are in common so that they are allocated on the + c heap rather than the stack. This common block is not + c referenced directly anywhere else. Padding is to avoid accidental + c cache problems, since all array sizes are powers of two. + c-------------------------------------------------------------------*/ + auto distspec = dash::DistributionSpec<3>( dash::BLOCKED, dash::NONE , dash::NONE); + + static dash::NArray u0(NZ, NY, NX, distspec); + /*static dcomplex pad1[3];*/ + static dash::NArray u1(NZ, NY, NX, distspec); + /*static dcomplex pad2[3];*/ + static dash::NArray u2(NZ, NY, NX, distspec); + /*static dcomplex pad3[3];*/ + static dash::NArray indexmap(NZ, NY, NX, distspec); + + int nthreads = dash::size(); + + int iter; + double total_time, mflops; + boolean verified; + char class_npb; + + /*-------------------------------------------------------------------- + c Run the entire problem once to make sure all data is touched. + c This reduces variable startup costs, which is important for such a + c short benchmark. The other NPB 2 implementations are similar. + c-------------------------------------------------------------------*/ + if ( 0 == dash::myid() ) { + for (i = 0; i < T_MAX; i++) { + timer_clear(i); + } + } + setup(); + + compute_indexmap(indexmap); + compute_initial_conditions(u1); + fft_init (dims[0][0]); + dash::barrier(); + + fft(1, u1, u0); + dash::barrier(); + + /*-------------------------------------------------------------------- + c Start over from the beginning. Note that all operations must + c be timed, in contrast to other benchmarks. + c-------------------------------------------------------------------*/ + if ( 0 == dash::myid() ) { + for (i = 0; i < T_MAX; i++) { + timer_clear(i); + } + + //std::clear(); + + timer_start(T_TOTAL); + if ( TIMERS_ENABLED == TRUE ) timer_start(T_SETUP); + } + + compute_indexmap(indexmap); + + compute_initial_conditions(u1); + + fft_init (dims[0][0]); + dash::barrier(); + + if ( 0 == dash::myid() ) { + if ( TIMERS_ENABLED == TRUE ) { + timer_stop(T_SETUP); + } + if ( TIMERS_ENABLED == TRUE ) { + timer_start(T_FFT); + } + } + + fft(1, u1, u0); + dash::barrier(); + + if ( 0 == dash::myid() ) { + if (TIMERS_ENABLED == TRUE) { + timer_stop(T_FFT); + } + } + + for (iter = 1; iter <= niter; iter++) { + + if ( 0 == dash::myid() ) { + if ( TIMERS_ENABLED == TRUE ) { + timer_start(T_EVOLVE); + } + } + + evolve(u0, u1, iter, indexmap); + dash::barrier(); + + if ( 0 == dash::myid() ) { + if ( TIMERS_ENABLED == TRUE ) { + timer_stop(T_EVOLVE); + } + if ( TIMERS_ENABLED == TRUE ) { + timer_start(T_FFT); + } + } + + fft(-1, u1, u2); + dash::barrier(); + + if ( 0 == dash::myid() ) { + if ( TIMERS_ENABLED == TRUE ) { + timer_stop(T_FFT); + } + if ( TIMERS_ENABLED == TRUE ) { + timer_start(T_CHECKSUM); + } + } + + checksum(iter, u2); + + if ( 0 == dash::myid() ) { + if ( TIMERS_ENABLED == TRUE ) { + timer_stop(T_CHECKSUM); + } + } + } + +if ( 0 == dash::myid() ) { + verify(NX, NY, NZ, niter, &verified, &class_npb); + + timer_stop(T_TOTAL); + + total_time = timer_read(T_TOTAL); + + if ( total_time != 0.0 ) { + mflops = 1.0e-6*(double)(NTOTAL) * + (14.8157+7.19641*log((double)(NTOTAL)) + + (5.23518+7.21113*log((double)(NTOTAL)))*niter)/total_time; + } else { + mflops = 0.0; + } + c_print_results((char*)"FT", class_npb, NX, NY, NZ, niter, nthreads, total_time, mflops, (char*)" floating point", verified, + (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7); + if ( TIMERS_ENABLED == TRUE ) print_timers(); + } + dash::finalize(); + + return 0; +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void evolve(dash::NArray &u0, dash::NArray &u1, int t, dash::NArray &indexmap) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c evolve u0 -> u1 (t time steps) in fourier space + c-------------------------------------------------------------------*/ + + if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX); + + for (int k = 0; k < u1.local.extent(0); k++){ + for (int j = 0; j < dims[0][1]; j++) { + for (int i = 0; i < NX; i++) { + crmul(u1.local(k,j,i), u0.local(k,j,i), ex[t*indexmap.local(k,j,i)]); + } + } + } + if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void compute_initial_conditions(dash::NArray &u0) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c Fill in array u0 with initial conditions from + c random number generator + c-------------------------------------------------------------------*/ + + /*double x0, start, an, dummy;*/ + double start, an; + //double tmp[NX*2*MAXDIM+1]; + dash::Array starts(NZ*dash::size()); + + start = SEED; + /*-------------------------------------------------------------------- + c Jump to the starting element for our first plane. + c-------------------------------------------------------------------*/ + ipow46(A, (zstart[0]-1)*2*NX*NY + (ystart[0]-1)*2*NX, &an); + /*dummy = */randlc(&start, an); + ipow46(A, 2*NX*NY, &an); + + starts.local[0] = start; + for (int i=1; i 1) { + n2 = n/2; + if ( n2 * 2 == n ) { + /*dummy = */randlc(&q, q); + n = n2; + } else { + /*dummy = */randlc(&r, q); + n = n-1; + } + } + /*dummy = */randlc(&r, q); + *result = r; +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void setup(void) { + + int i; + niter = NITER_DEFAULT; + if ( 0 == dash::myid() ) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*int ierr, i, j, fstatus;*/ + //int i; + + printf("\n\n NAS Parallel Benchmarks 4.0 C++ DASH version" " - FT Benchmark\n\n"); + printf("\n\n Developed by: Dalvan Griebler \n"); + printf("\n\n DASH version by: Nicco Mietzsch \n"); + + printf(" Size : %3dx%3dx%3d\n", NX, NY, NZ); + printf(" Iterations : %7d\n", niter); + + /* 1004 format(' Number of processes : ', i7) + 1005 format(' Processor array : ', i3, 'x', i3) + 1006 format(' WARNING: compiled for ', i5, ' processes. ', + > ' Will not verify. ')*/ + } + + for (i = 0;i < 3 ; i++) { + dims[i][0] = NX; + dims[i][1] = NY; + dims[i][2] = NZ; + } + + + for (i = 0; i < 3; i++) { + xstart[i] = 1; + xend[i] = NX; + ystart[i] = 1; + yend[i] = NY; + zstart[i] = 1; + zend[i] = NZ; + } + + /*-------------------------------------------------------------------- + c Set up info for blocking of ffts and transposes. This improves + c performance on cache-based systems. Blocking involves + c working on a chunk of the problem at a time, taking chunks + c along the first, second, or third dimension. + c + c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) + c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) + c + c Since 1st dim is always in processor, we'll assume it's long enough + c (default blocking factor is 16 so min size for 1st dim is 16) + c The only case we have to worry about is cffts1 in a 2d decomposition. + c so the blocking factor should not be larger than the 2nd dimension. + c-------------------------------------------------------------------*/ + + fftblock = FFTBLOCK_DEFAULT; + fftblockpad = FFTBLOCKPAD_DEFAULT; + + if ( fftblock != FFTBLOCK_DEFAULT ) fftblockpad = fftblock+3; +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void compute_indexmap(dash::NArray &indexmap) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 + c for time evolution exponent. + c-------------------------------------------------------------------*/ + + int i; + double ap; + + /*-------------------------------------------------------------------- + c basically we want to convert the fortran indices + c 1 2 3 4 5 6 7 8 + c to + c 0 1 2 3 -4 -3 -2 -1 + c The following magic formula does the trick: + c mod(i-1+n/2, n) - n/2 + c-------------------------------------------------------------------*/ + + if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX); + + int zplanes_per_unit = ceil((double) indexmap.extent(0) / dash::size()); + int myoffset = dash::myid()*zplanes_per_unit; + + for (int k = 0; k < indexmap.local.extent(0); k++) { + int kk = (k+myoffset+1+zstart[2]-2+NZ/2)%NZ - NZ/2; + int kk2 = kk*kk; + for (int j = 0; j < dims[2][1]; j++) { + int jj = (j+1+ystart[2]-2+NY/2)%NY - NY/2; + int jj2 = jj*jj; + for (int i = 0; i < dims[2][0]; i++) { + int ii = (i+1+xstart[2]-2+NX/2)%NX - NX/2; + indexmap.local(k,j,i) = kk2+jj2+ii*ii; + } + } + } + + if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX); + + /*-------------------------------------------------------------------- + c compute array of exponentials for time evolution. + c-------------------------------------------------------------------*/ + ap = - 4.0 * ALPHA * PI * PI; + + ex[0] = 1.0; + ex[1] = exp(ap); + for (i = 2; i <= EXPMAX; i++) { + ex[i] = ex[i-1]*ex[1]; + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void print_timers(void) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + int i; + const char *tstrings[] = { + " total ", + " setup ", + " fft ", + " evolve ", + " checksum ", + " fftlow ", + " fftcopy "}; + + for (i = 0; i < T_MAX; i++) { + if ( timer_read(i) != 0.0 ) { + printf("timer %2d(%s) :%10.6f\n", i, tstrings[i], timer_read(i)); + } + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void fft(int dir, dash::NArray &x1, dash::NArray &x2) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c note: args x1, x2 must be different arrays + c note: args for cfftsx are (direction, layout, xin, xout, scratch) + c xin/xout may be the same and it can be somewhat faster + c if they are + c-------------------------------------------------------------------*/ + + if ( dir == 1 ) { + cffts1(1, dims[0], x1, x1);/* x1 -> x1 */ + cffts2(1, dims[1], x1, x1);/* x1 -> x1 */ + dash::barrier(); + cffts3(1, dims[2], x1, x2);/* x1 -> x2 */ + } else { + cffts3(-1, dims[2], x1, x1);/* x1 -> x1 */ + dash::barrier(); + cffts2(-1, dims[1], x1, x1);/* x1 -> x1 */ + cffts1(-1, dims[0], x1, x2);/* x1 -> x2 */ + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void cffts1(int is, int d[3], dash::NArray &x, dash::NArray &xout) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + int logd[3]; + + for (int i = 0; i < 3; i++) { + logd[i] = ilog2(d[i]); + } + + if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX); + + for (int k = 0; k < x.local.extent(0); k++) { + dcomplex y0[NX][FFTBLOCKPAD]; + dcomplex y1[NX][FFTBLOCKPAD]; + + for (int jj = 0; jj <= d[1] - fftblock; jj+=fftblock) { + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); + for (int j = 0; j < fftblock; j++) { + for (int i = 0; i < d[0]; i++) { + y0[i][j].real = x.local(k,j+jj,i).real; + y0[i][j].imag = x.local(k,j+jj,i).imag; + } + } + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); + cfftz (is, logd[0], d[0], y0, y1); + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); + for (int j = 0; j < fftblock; j++) { + for (int i = 0; i < d[0]; i++) { + xout.local(k,j+jj,i).real = y0[i][j].real; + xout.local(k,j+jj,i).imag = y0[i][j].imag; + } + } + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); + } + } + if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void cffts2(int is, int d[3], dash::NArray &x, dash::NArray &xout) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + int logd[3]; + + for (int i = 0; i < 3; i++) { + logd[i] = ilog2(d[i]); + } + + if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX); + + for (int k = 0; k < x.local.extent(0); k++) { + dcomplex y0[NX][FFTBLOCKPAD]; + dcomplex y1[NX][FFTBLOCKPAD]; + + for (int ii = 0; ii <= d[0] - fftblock; ii+=fftblock) { + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); + for (int j = 0; j < d[1]; j++) { + for (int i = 0; i < fftblock; i++) { + y0[j][i].real = x.local(k,j,i+ii).real; + y0[j][i].imag = x.local(k,j,i+ii).imag; + } + // int offset = k*NX*NY+j*NX; + // std::copy(x.lbegin()+offset+ii, x.lbegin()+offset+ii+fftblock, &y0[j][0]); + } + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); + cfftz (is, logd[1], d[1], y0, y1); + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); + for (int j = 0; j < d[1]; j++) { + // for (int i = 0; i < fftblock; i++) { + // xout.local(k,j,i+ii).real = y0[j][i].real; + // xout.local(k,j,i+ii).imag = y0[j][i].imag; + // } + int offset = k*NX*NY+j*NX; + std::copy(&y0[j][0], &y0[j][fftblock], xout.lbegin()+offset+ii); + // + // The combination of reading in sequential and writing with std::copy + // seems to be the fastest with my setup. This definitely needs + // further investigation. + // + } + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); + } + } + if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void cffts3(int is, int d[3], dash::NArray &x, dash::NArray &xout) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + int logd[3]; + + for (int i = 0;i < 3; i++) { + logd[i] = ilog2(d[i]); + } + + if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX); + + int yplanes_per_unit = ceil((double) d[1] / dash::size()); //this is effectively a blocked pattern. It could be better to use cyclic for higher unit counts. + int myoffset = dash::myid()*yplanes_per_unit; + + for (int j = 0; j < yplanes_per_unit && myoffset+j < d[1]; j++) { + + dcomplex y0[NX][FFTBLOCKPAD]; + dcomplex y1[NX][FFTBLOCKPAD]; + + // dash::Future futs[d[2]]; + // typedef typename dash::Future, dash::GlobStaticMem, dash::GlobPtr >, dash::GlobRef > > async_write_type; + // async_write_type futs_w[d[2]]; + + // + // Asynchronous copy does not seem to yield anything, at best it is equally good. + // + + for (int ii = 0; ii <= d[0] - fftblock; ii+=fftblock) { + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); + for (int k = 0; k < d[2]; k++) { + // for (int i = 0; i < fftblock; i++) { + //y0[k][i] = x(k,myoffset+j,i+ii); + // } + int offset = k*NX*NY+(myoffset+j)*NX; + dash::copy(x.begin()+offset+ii, x.begin()+offset+ii+fftblock, &y0[k][0]); + // futs[k] = dash::copy_async(x.begin()+offset+ii, x.begin()+offset+ii+fftblock, &y0[k][0]); + } + + // for (int k = 0; k < d[2]; k++) futs[k].wait(); + + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); + cfftz (is, logd[2], d[2], y0, y1); + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); + // if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); + for (int k = 0; k < d[2]; k++) { + // for (int i = 0; i < fftblock; i++) { + // xout(k,myoffset+j,i+ii) = y0[k][i]; + // } + int offset = k*NX*NY+(myoffset+j)*NX; + dash::copy(&y0[k][0], &y0[k][fftblock], xout.begin()+offset+ii); + // futs_w[k] = dash::copy_async(&y0[k][0], &y0[k][fftblock], xout.begin()+offset+ii); + } + // for (int k = 0; k < d[2]; k++) futs_w[k].wait(); + // if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); + } + } + if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void fft_init (int n) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c compute the roots-of-unity array that will be used for subsequent FFTs. + c-------------------------------------------------------------------*/ + + /*int m,nu,ku,i,j,ln;*/ + int m,ku,i,j,ln; + double t, ti; + + + /*-------------------------------------------------------------------- + c Initialize the U array with sines and cosines in a manner that permits + c stride one access at each FFT iteration. + c-------------------------------------------------------------------*/ + /*nu = n;*/ + m = ilog2(n); + u[0].real = (double)m; + u[0].imag = 0.0; + ku = 1; + ln = 1; + + for (j = 1; j <= m; j++) { + t = PI / ln; + + for (i = 0; i <= ln - 1; i++) { + ti = i * t; + u[i+ku].real = cos(ti); + u[i+ku].imag = sin(ti); + } + + ku = ku + ln; + ln = 2 * ln; + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void cfftz (int is, int m, int n, dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD]) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c Computes NY N-point complex-to-complex FFTs of X using an algorithm due + c to Swarztrauber. X is both the input and the output array, while Y is a + c scratch array. It is assumed that N = 2^M. Before calling CFFTZ to + c perform FFTs, the array U must be initialized by calling CFFTZ with IS + c set to 0 and M set to MX, where MX is the maximum value of M for any + c subsequent call. + c-------------------------------------------------------------------*/ + + int i,j,l,mx; + + /*-------------------------------------------------------------------- + c Check if input parameters are invalid. + c-------------------------------------------------------------------*/ + mx = (int)(u[0].real); + if ( (is != 1 && is != -1) || m < 1 || m > mx ) { + printf("CFFTZ: Either U has not been initialized, or else\n" + "one of the input parameters is invalid%5d%5d%5d\n", + is, m, mx); + exit(1); + } + + /*-------------------------------------------------------------------- + c Perform one variant of the Stockham FFT. + c-------------------------------------------------------------------*/ + for (l = 1; l <= m; l+=2) { + fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y); + if ( l == m ) break; + fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x); + } + + /*-------------------------------------------------------------------- + c Copy Y to X. + c-------------------------------------------------------------------*/ + if ( m % 2 == 1 ) { + for (j = 0; j < n; j++) { + for (i = 0; i < fftblock; i++) { + x[j][i].real = y[j][i].real; + x[j][i].imag = y[j][i].imag; + } + } + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void fftz2 (int is, int l, int m, int n, int ny, int ny1, dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD]) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c Performs the L-th iteration of the second variant of the Stockham FFT. + c-------------------------------------------------------------------*/ + + int k,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22; + /*dcomplex u1,x11,x21;*/ + dcomplex u1; + + /*-------------------------------------------------------------------- + c Set initial parameters. + c-------------------------------------------------------------------*/ + + n1 = n / 2; + if ( l-1 == 0 ) { + lk = 1; + } else { + lk = 2 << ((l - 1)-1); + } + if ( m-l == 0 ) { + li = 1; + } else { + li = 2 << ((m - l)-1); + } + lj = 2 * lk; + ku = li; + + for (i = 0; i < li; i++) { + + i11 = i * lk; + i12 = i11 + n1; + i21 = i * lj; + i22 = i21 + lk; + if ( is >= 1 ) { + u1.real = u[ku+i].real; + u1.imag = u[ku+i].imag; + } else { + u1.real = u[ku+i].real; + u1.imag = -u[ku+i].imag; + } + + /*-------------------------------------------------------------------- + c This loop is vectorizable. + c-------------------------------------------------------------------*/ + //but it creates more overhead + + for (k = 0; k < lk; k++) { + for (j = 0; j < ny; j++) { + double x11real, x11imag; + double x21real, x21imag; + x11real = x[i11+k][j].real; + x11imag = x[i11+k][j].imag; + x21real = x[i12+k][j].real; + x21imag = x[i12+k][j].imag; + y[i21+k][j].real = x11real + x21real; + y[i21+k][j].imag = x11imag + x21imag; + y[i22+k][j].real = u1.real * (x11real - x21real) + - u1.imag * (x11imag - x21imag); + y[i22+k][j].imag = u1.real * (x11imag - x21imag) + + u1.imag * (x11real - x21real); + } + } + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static int ilog2(int n) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + int nn, lg; + + if ( n == 1 ) { + return 0; + } + lg = 1; + nn = 2; + while (nn < n) { + nn = nn << 1; + lg++; + } + + return lg; +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void checksum(int i, dash::NArray &u1) { +if ( 0 == dash::myid() ) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + /*int j, q,r,s, ierr;*/ + int j, q,r,s; + /*dcomplex chk,allchk;*/ + dcomplex chk; + + chk.real = 0.0; + chk.imag = 0.0; + + for (j = 1; j <= 1024; j++) { + q = j%NX+1; + if ( q >= xstart[0] && q <= xend[0] ) { + r = (3*j)%NY+1; + if ( r >= ystart[0] && r <= yend[0] ) { + s = (5*j)%NZ+1; + if ( s >= zstart[0] && s <= zend[0] ) { + dcomplex v1 = u1(s-zstart[0],r-ystart[0],q-xstart[0]); + cadd(chk,chk,v1); + } + } + } + } + + { + sums[i].real += chk.real; + sums[i].imag += chk.imag; + } + + + { + /* complex % real */ + sums[i].real = sums[i].real/(double)(NTOTAL); + sums[i].imag = sums[i].imag/(double)(NTOTAL); + + printf("T = %5d Checksum = %22.12e %22.12e\n", + i, sums[i].real, sums[i].imag); + } + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void verify (int d1, int d2, int d3, int nt, boolean *verified, char *class_npb) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*int ierr, size, i;*/ + int i; + double err, epsilon; + + /*-------------------------------------------------------------------- + c Sample size reference checksums + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c class_npb S size reference checksums + c-------------------------------------------------------------------*/ + double vdata_real_s[6+1] = { 0.0, + 5.546087004964e+02, + 5.546385409189e+02, + 5.546148406171e+02, + 5.545423607415e+02, + 5.544255039624e+02, + 5.542683411902e+02 }; + double vdata_imag_s[6+1] = { 0.0, + 4.845363331978e+02, + 4.865304269511e+02, + 4.883910722336e+02, + 4.901273169046e+02, + 4.917475857993e+02, + 4.932597244941e+02 }; + /*-------------------------------------------------------------------- + c class_npb W size reference checksums + c-------------------------------------------------------------------*/ + double vdata_real_w[6+1] = { 0.0, + 5.673612178944e+02, + 5.631436885271e+02, + 5.594024089970e+02, + 5.560698047020e+02, + 5.530898991250e+02, + 5.504159734538e+02 }; + double vdata_imag_w[6+1] = { 0.0, + 5.293246849175e+02, + 5.282149986629e+02, + 5.270996558037e+02, + 5.260027904925e+02, + 5.249400845633e+02, + 5.239212247086e+02 }; + /*-------------------------------------------------------------------- + c class_npb A size reference checksums + c-------------------------------------------------------------------*/ + double vdata_real_a[6+1] = { 0.0, + 5.046735008193e+02, + 5.059412319734e+02, + 5.069376896287e+02, + 5.077892868474e+02, + 5.085233095391e+02, + 5.091487099959e+02 }; + double vdata_imag_a[6+1] = { 0.0, + 5.114047905510e+02, + 5.098809666433e+02, + 5.098144042213e+02, + 5.101336130759e+02, + 5.104914655194e+02, + 5.107917842803e+02 }; + /*-------------------------------------------------------------------- + c class_npb B size reference checksums + c-------------------------------------------------------------------*/ + double vdata_real_b[20+1] = { 0.0, + 5.177643571579e+02, + 5.154521291263e+02, + 5.146409228649e+02, + 5.142378756213e+02, + 5.139626667737e+02, + 5.137423460082e+02, + 5.135547056878e+02, + 5.133910925466e+02, + 5.132470705390e+02, + 5.131197729984e+02, + 5.130070319283e+02, + 5.129070537032e+02, + 5.128182883502e+02, + 5.127393733383e+02, + 5.126691062020e+02, + 5.126064276004e+02, + 5.125504076570e+02, + 5.125002331720e+02, + 5.124551951846e+02, + 5.124146770029e+02 }; + double vdata_imag_b[20+1] = { 0.0, + 5.077803458597e+02, + 5.088249431599e+02, + 5.096208912659e+02, + 5.101023387619e+02, + 5.103976610617e+02, + 5.105948019802e+02, + 5.107404165783e+02, + 5.108576573661e+02, + 5.109577278523e+02, + 5.110460304483e+02, + 5.111252433800e+02, + 5.111968077718e+02, + 5.112616233064e+02, + 5.113203605551e+02, + 5.113735928093e+02, + 5.114218460548e+02, + 5.114656139760e+02, + 5.115053595966e+02, + 5.115415130407e+02, + 5.115744692211e+02 }; + /*-------------------------------------------------------------------- + c class_npb C size reference checksums + c-------------------------------------------------------------------*/ + double vdata_real_c[20+1] = { 0.0, + 5.195078707457e+02, + 5.155422171134e+02, + 5.144678022222e+02, + 5.140150594328e+02, + 5.137550426810e+02, + 5.135811056728e+02, + 5.134569343165e+02, + 5.133651975661e+02, + 5.132955192805e+02, + 5.132410471738e+02, + 5.131971141679e+02, + 5.131605205716e+02, + 5.131290734194e+02, + 5.131012720314e+02, + 5.130760908195e+02, + 5.130528295923e+02, + 5.130310107773e+02, + 5.130103090133e+02, + 5.129905029333e+02, + 5.129714421109e+02 }; + double vdata_imag_c[20+1] = { 0.0, + 5.149019699238e+02, + 5.127578201997e+02, + 5.122251847514e+02, + 5.121090289018e+02, + 5.121143685824e+02, + 5.121496764568e+02, + 5.121870921893e+02, + 5.122193250322e+02, + 5.122454735794e+02, + 5.122663649603e+02, + 5.122830879827e+02, + 5.122965869718e+02, + 5.123075927445e+02, + 5.123166486553e+02, + 5.123241541685e+02, + 5.123304037599e+02, + 5.123356167976e+02, + 5.123399592211e+02, + 5.123435588985e+02, + 5.123465164008e+02 }; + + epsilon = 1.0e-12; + *verified = TRUE; + *class_npb = 'U'; + + if ( d1 == 64 && + d2 == 64 && + d3 == 64 && + nt == 6 ) { + *class_npb = 'S'; + for (i = 1; i <= nt; i++) { + err = (get_real(sums[i]) - vdata_real_s[i]) / vdata_real_s[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + err = (get_imag(sums[i]) - vdata_imag_s[i]) / vdata_imag_s[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + } + } else if ( d1 == 128 && + d2 == 128 && + d3 == 32 && + nt == 6 ) { + *class_npb = 'W'; + for (i = 1; i <= nt; i++) { + err = (get_real(sums[i]) - vdata_real_w[i]) / vdata_real_w[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + err = ( get_imag(sums[i] ) - vdata_imag_w[i]) / vdata_imag_w[i]; + if (fabs(err) > epsilon) { + *verified = FALSE; + break; + } + } + } else if ( d1 == 256 && + d2 == 256 && + d3 == 128 && + nt == 6 ) { + *class_npb = 'A'; + for (i = 1; i <= nt; i++) { + err = (get_real(sums[i]) - vdata_real_a[i]) / vdata_real_a[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + err = (get_imag(sums[i]) - vdata_imag_a[i]) / vdata_imag_a[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + } + } else if ( d1 == 512 && d2 == 256 && d3 == 256 && nt == 20 ) { + *class_npb = 'B'; + for (i = 1; i <= nt; i++) { + err = (get_real(sums[i]) - vdata_real_b[i]) / vdata_real_b[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + err = (get_imag(sums[i]) - vdata_imag_b[i]) / vdata_imag_b[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + } + } else if ( d1 == 512 && + d2 == 512 && + d3 == 512 && + nt == 20 ) { + *class_npb = 'C'; + for (i = 1; i <= nt; i++) { + err = (get_real(sums[i]) - vdata_real_c[i]) / vdata_real_c[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + err = (get_imag(sums[i]) - vdata_imag_c[i]) / vdata_imag_c[i]; + if ( fabs(err) > epsilon ) { + *verified = FALSE; + break; + } + } + } + + if ( *class_npb != 'U' ) { + printf("Result verification successful\n"); + } else { + printf("Result verification failed\n"); + } + printf("class_npb = %1c\n", *class_npb); +} diff --git a/NPB/FT/global.hpp b/NPB/FT/global.hpp new file mode 100644 index 0000000..a47501d --- /dev/null +++ b/NPB/FT/global.hpp @@ -0,0 +1,111 @@ +#include "npbparams.hpp" + + +/* +c If processor array is 1x1 -> 0D grid decomposition + + +c Cache blocking params. These values are good for most +c RISC processors. +c FFT parameters: +c fftblock controls how many ffts are done at a time. +c The default is appropriate for most cache-based machines +c On vector machines, the FFT can be vectorized with vector +c length equal to the block size, so the block size should +c be as large as possible. This is the size of the smallest +c dimension of the problem: 128 for class A, 256 for class B and +c 512 for class C. +*/ + +#define FFTBLOCK_DEFAULT 64 +#define FFTBLOCKPAD_DEFAULT 64 //original: 18 + +#define FFTBLOCK FFTBLOCK_DEFAULT +#define FFTBLOCKPAD FFTBLOCKPAD_DEFAULT + +/* COMMON block: blockinfo */ +int fftblock; +int fftblockpad; + +/* +c we need a bunch of logic to keep track of how +c arrays are laid out. + + +c Note: this serial version is the derived from the parallel 0D case +c of the ft NPB. +c The computation proceeds logically as + +c set up initial conditions +c fftx(1) +c transpose (1->2) +c ffty(2) +c transpose (2->3) +c fftz(3) +c time evolution +c fftz(3) +c transpose (3->2) +c ffty(2) +c transpose (2->1) +c fftx(1) +c compute residual(1) + +c for the 0D, 1D, 2D strategies, the layouts look like xxx +c +c 0D 1D 2D +c 1: xyz xyz xyz +c 2: xyz xyz yxz +c 3: xyz zyx zxy + +c the array dimensions are stored in dims(coord, phase) +*/ + +/* COMMON block: layout */ +static int dims[3][3]; +static int xstart[3]; +static int ystart[3]; +static int zstart[3]; +static int xend[3]; +static int yend[3]; +static int zend[3]; + +#define T_TOTAL 0 +#define T_SETUP 1 +#define T_FFT 2 +#define T_EVOLVE 3 +#define T_CHECKSUM 4 +#define T_FFTLOW 5 +#define T_FFTCOPY 6 +#define T_MAX 7 + +#define TIMERS_ENABLED FALSE + +/* other stuff */ + +#define SEED 314159265.0 +#define A 1220703125.0 +#define PI 3.141592653589793238 +#define ALPHA 1.0e-6 + +#define EXPMAX (NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4)) + +/* COMMON block: excomm */ +static double ex[EXPMAX+1]; /* ex(0:expmax) */ + +/* +c roots of unity array +c relies on x being largest dimension? +*/ + +/* COMMON block: ucomm */ +static dcomplex u[NX]; + +/* for checksum data */ + +/* COMMON block: sumcomm */ +static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */ + +/* number of iterations*/ + +/* COMMON block: iter */ +static int niter; diff --git a/NPB/IS/Makefile b/NPB/IS/Makefile new file mode 100644 index 0000000..40b2865 --- /dev/null +++ b/NPB/IS/Makefile @@ -0,0 +1,24 @@ +SHELL=/bin/sh +BENCHMARK=is +BENCHMARKU=IS + +include ../config/make.def + +include ../sys/make.common + +OBJS = is.o \ + ${COMMON}/c_print_results.o \ + ${COMMON}/c_timers.o \ + ${COMMON}/c_wtime.o + + +${PROGRAM}: config ${OBJS} + ${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS} + +is.o: is.cpp npbparams.hpp + ${CCOMPILE} is.cpp + +clean: + - rm -f *.o *~ mputil* + - rm -f npbparams.hpp core + - if [ -d rii_files ]; then rm -r rii_files; fi diff --git a/NPB/IS/is.cpp b/NPB/IS/is.cpp new file mode 100644 index 0000000..f182724 --- /dev/null +++ b/NPB/IS/is.cpp @@ -0,0 +1,996 @@ +/*-------------------------------------------------------------------- + + Information on NAS Parallel Benchmarks is available at: + + http://www.nas.nasa.gov/Software/NPB/ + + Authors: M. Yarrow + H. Jin + + CPP and OpenMP version: + Dalvan Griebler + Júnior Löff + +--------------------------------------------------------------------*/ + +#include + +#include "npbparams.hpp" +#include +#include +#include + + +/*****************************************************************/ +/* For serial IS, buckets are not really req'd to solve NPB1 IS */ +/* spec, but their use on some machines improves performance, on */ +/* other machines the use of buckets compromises performance, */ +/* probably because it is extra computation which is not req'd. */ +/* (Note: Mechanism not understood, probably cache related) */ +/* Example: SP2-66MhzWN: 50% speedup with buckets */ +/* Example: SGI Indy5000: 50% slowdown with buckets */ +/* Example: SGI O2000: 400% slowdown with buckets (Wow!) */ +/*****************************************************************/ +/* To disable the use of buckets, comment out the following line */ +//#define USE_BUCKETS + +/* Uncomment below for cyclic schedule */ +/*#define SCHED_CYCLIC*/ + + +/******************/ +/* default values */ +/******************/ +#ifndef CLASS +#define CLASS 'S' +#endif + + +/*************/ +/* CLASS S */ +/*************/ +#if CLASS == 'S' +#define TOTAL_KEYS_LOG_2 16 +#define MAX_KEY_LOG_2 11 +#define NUM_BUCKETS_LOG_2 9 +#endif + + +/*************/ +/* CLASS W */ +/*************/ +#if CLASS == 'W' +#define TOTAL_KEYS_LOG_2 20 +#define MAX_KEY_LOG_2 16 +#define NUM_BUCKETS_LOG_2 10 +#endif + +/*************/ +/* CLASS A */ +/*************/ +#if CLASS == 'A' +#define TOTAL_KEYS_LOG_2 23 +#define MAX_KEY_LOG_2 19 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS B */ +/*************/ +#if CLASS == 'B' +#define TOTAL_KEYS_LOG_2 25 +#define MAX_KEY_LOG_2 21 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS C */ +/*************/ +#if CLASS == 'C' +#define TOTAL_KEYS_LOG_2 27 +#define MAX_KEY_LOG_2 23 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS D */ +/*************/ +#if CLASS == 'D' +#define TOTAL_KEYS_LOG_2 31 +#define MAX_KEY_LOG_2 27 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +#if CLASS == 'D' +#define TOTAL_KEYS (1L << TOTAL_KEYS_LOG_2) +#else +#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2) +#endif +#define MAX_KEY (1 << MAX_KEY_LOG_2) +#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2) +#define NUM_KEYS TOTAL_KEYS +#define SIZE_OF_BUFFERS NUM_KEYS + + +#define MAX_ITERATIONS 10 +#define TEST_ARRAY_SIZE 5 + + +/*************************************/ +/* Typedef: if necessary, change the */ +/* size of int here by changing the */ +/* int type to, say, long */ +/*************************************/ +#if CLASS == 'D' +typedef long INT_TYPE; +#else +typedef int INT_TYPE; +#endif + + +/********************/ +/* Some global info */ +/********************/ +int passed_verification; + + +/************************************/ +/* These are the three main arrays. */ +/* See SIZE_OF_BUFFERS def above */ +/************************************/ +dash::Array key_array; +dash::Array key_buff2; + +INT_TYPE partial_verify_vals[TEST_ARRAY_SIZE]; + +#ifdef USE_BUCKETS +dash::Array bucket_size; + +using pattern_t = dash::CSRPattern<1>; +using array_t = dash::Array; + +array_t key_buff1; +#else +dash::Array key_buff1; + +dash::Array work_buffers; +#endif + + +/**********************/ +/* Partial verif info */ +/**********************/ +INT_TYPE test_index_array[TEST_ARRAY_SIZE], + test_rank_array[TEST_ARRAY_SIZE], + + S_test_index_array[TEST_ARRAY_SIZE] = +{48427,17148,23627,62548,4431}, +S_test_rank_array[TEST_ARRAY_SIZE] = +{0,18,346,64917,65463}, + +W_test_index_array[TEST_ARRAY_SIZE] = +{357773,934767,875723,898999,404505}, +W_test_rank_array[TEST_ARRAY_SIZE] = +{1249,11698,1039987,1043896,1048018}, + +A_test_index_array[TEST_ARRAY_SIZE] = +{2112377,662041,5336171,3642833,4250760}, +A_test_rank_array[TEST_ARRAY_SIZE] = +{104,17523,123928,8288932,8388264}, + +B_test_index_array[TEST_ARRAY_SIZE] = +{41869,812306,5102857,18232239,26860214}, +B_test_rank_array[TEST_ARRAY_SIZE] = +{33422937,10244,59149,33135281,99}, + +C_test_index_array[TEST_ARRAY_SIZE] = +{44172927,72999161,74326391,129606274,21736814}, +C_test_rank_array[TEST_ARRAY_SIZE] = +{61147,882988,266290,133997595,133525895}, + +D_test_index_array[TEST_ARRAY_SIZE] = +{1317351170,995930646,1157283250,1503301535,1453734525}, +D_test_rank_array[TEST_ARRAY_SIZE] = +{1,36538729,1978098519,2145192618,2147425337}; + + +/***********************/ +/* function prototypes */ +/***********************/ +double randlc( double *X, double *A ); + +void full_verify( void ); + +/*void c_print_results( char *name, + char class, + int n1, + int n2, + int n3, + int niter, + double t, + double mops, + char *optype, + int passed_verification, + char *npbversion, + char *compiletime, + char *cc, + char *clink, + char *c_lib, + char *c_inc, + char *cflags, + char *clinkflags );*/ +void c_print_results( char *name, char class_npb, int n1, int n2, int n3, int niter, int nthreads, double t, + double mops, char *optype, int passed_verification, char *npbversion, char *compiletime, char *cc, + char *clink, char *c_lib, char *c_inc, char *cflags, char *clinkflags, char *rand); + +void timer_clear( int n ); +void timer_start( int n ); +void timer_stop( int n ); +double timer_read( int n ); + + +/* + * FUNCTION RANDLC (X, A) + * + * This routine returns a uniform pseudorandom double precision number in the + * range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers + * before repeating. The argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). The returned value RANDLC is normalized to be + * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain + * the new seed x_1, so that subsequent calls to RANDLC using the same + * arguments will generate a continuous sequence. + * + * This routine should produce the same results on any computer with at least + * 48 mantissa bits in double precision floating point data. On Cray systems, + * double precision should be disabled. + * + * David H. Bailey October 26, 1990 + * + * IMPLICIT DOUBLE PRECISION (A-H, O-Z) + * SAVE KS, R23, R46, T23, T46 + * DATA KS/0/ + * + * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, + * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than + * by merely using the ** operator, in order to insure that the results are + * exact on all systems. This code assumes that 0.5D0 is represented exactly. + */ + +/*****************************************************************/ +/************* R A N D L C ************/ +/************* ************/ +/************* portable random number generator ************/ +/*****************************************************************/ + +static int KS=0; +static double R23, R46, T23, T46; + +double randlc( double *X, double *A ) { + + double T1, T2, T3, T4; + double A1; + double A2; + double X1; + double X2; + double Z; + int i, j; + + if (KS == 0) { + + R23 = 1.0; + R46 = 1.0; + T23 = 1.0; + T46 = 1.0; + + for (i=1; i<=23; i++) { + R23 = 0.50 * R23; + T23 = 2.0 * T23; + } + + for (i=1; i<=46; i++) { + R46 = 0.50 * R46; + T46 = 2.0 * T46; + } + + KS = 1; + } + + /* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ + + T1 = R23 * *A; + j = T1; + A1 = j; + A2 = *A - T23 * A1; + + /* Break X into two parts such that X = 2^23 * X1 + X2, compute + Z = A1 * X2 + A2 * X1 (mod 2^23), and then + X = 2^23 * Z + A2 * X2 (mod 2^46). */ + + T1 = R23 * *X; + j = T1; + X1 = j; + X2 = *X - T23 * X1; + T1 = A1 * X2 + A2 * X1; + + j = R23 * T1; + T2 = j; + Z = T1 - T23 * T2; + T3 = T23 * Z + A2 * X2; + j = R46 * T3; + T4 = j; + *X = T3 - T46 * T4; + return(R46 * *X); +} + + + + +/*****************************************************************/ +/************ F I N D _ M Y _ S E E D ************/ +/************ ************/ +/************ returns parallel random number seq seed ************/ +/*****************************************************************/ + +/* + * Create a random number sequence of total length nn residing + * on np number of processors. Each processor will therefore have a + * subsequence of length nn/np. This routine returns that random + * number which is the first random number for the subsequence belonging + * to processor rank kn, and which is used as seed for proc kn ran # gen. + */ + +double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */ + int np, /* np = num procs */ + long nn, /* total num of ran numbers, all procs */ + double s, /* Ran num seed, for ex.: 314159265.00 */ + double a ) /* Ran num gen mult, try 1220703125.00 */ +{ + + double t1,t2; + long mq,nq,kk,ik; + + if ( kn == 0 ) return s; + + mq = (nn/4 + np - 1) / np; + nq = mq * 4 * kn; /* number of rans to be skipped */ + + t1 = s; + t2 = a; + kk = nq; + while ( kk > 1 ) { + ik = kk / 2; + if ( 2 * ik == kk ) { + (void)randlc( &t2, &t2 ); + kk = ik; + } + else { + (void)randlc( &t1, &t2 ); + kk = kk - 1; + } + } + (void)randlc( &t1, &t2 ); + + return( t1 ); +} + + + +/*****************************************************************/ +/************* C R E A T E _ S E Q ************/ +/*****************************************************************/ + +void create_seq( double seed, double a ) { + + double x, s; + INT_TYPE i, k; + + INT_TYPE k1, k2; + double an = a; + int myid, num_procs; + INT_TYPE mq; + + myid = dash::myid(); + num_procs = dash::size(); + + mq = (NUM_KEYS + num_procs - 1) / num_procs; + k1 = mq * myid; + k2 = k1 + mq; + if ( k2 > NUM_KEYS ) k2 = NUM_KEYS; + + KS = 0; + s = find_my_seed( myid, num_procs, (long)4*NUM_KEYS, seed, an ); + + k = MAX_KEY/4; + + for (i=k1; i l_sizes; + dash::Array dummy(NUM_BUCKETS); + + dash::Array my_elem_count(dash::size()); + + my_elem_count.local[0] = dummy.lsize() * num_bucket_keys; + + dash::barrier(); + + for (int unit_idx = 0; unit_idx < dash::size(); ++unit_idx) { + l_sizes.push_back(my_elem_count[unit_idx]); + } + + pattern_t pattern(l_sizes); + + key_buff1.allocate(pattern); + +#else + key_buff1.allocate(MAX_KEY, dash::BLOCKED); + + work_buffers.allocate(num_procs*MAX_KEY); +#endif /*USE_BUCKETS*/ +} + + + +/*****************************************************************/ +/************* F U L L _ V E R I F Y ************/ +/*****************************************************************/ + + +void full_verify( void ) { + + /* Now, finally, sort the keys: */ + + /* Copy keys into work array; keys in key_array will be reassigned. */ + + std::copy(key_array.lbegin(), key_array.lend(), key_buff2.lbegin()); + + key_array.barrier(); + + /* This is actual sorting. Each thread is responsible for + a subset of key values */ + INT_TYPE j = dash::size(); + j = (MAX_KEY + j - 1) / j; + INT_TYPE k1 = j * dash::myid(); + INT_TYPE k2 = k1 + j; + if ( k2 > MAX_KEY ) k2 = MAX_KEY; + + std::for_each(key_buff2.begin(), key_buff2.end(), [k1, k2](INT_TYPE i) { + if (i >= k1 && i < k2) { + key_array[--key_buff1[i]] = i; + } + }); + + dash::barrier(); + + /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */ + + dash::Array faults(dash::size()); + dash::fill(faults.begin(), faults.end(), 0); + + dash::Array v2(NUM_KEYS-1); + + //std::iota(v2.lbegin(), v2.lend(), dash::myid() * ceil((double) NUM_KEYS / dash::size())); //only works when using blocking pattern + if ( 0 == dash::myid() ) std::iota(v2.begin(), v2.end(), 1); + + v2.barrier(); + + dash::for_each(v2.begin(), v2.end(), [&faults, &v2](INT_TYPE i) { + if ( key_array[i-1] > key_array[i] ){ + printf("Fault at %d of %d\n", i, (int) v2[NUM_KEYS-1]); + faults.local[0]++; + } + }); + + dash::barrier(); + + + if ( 0 == dash::myid() ) { + for (int i = 1; i < dash::size(); i++) faults[0] += faults[i]; + + if ( faults[0] != 0 ) + printf( "Full_verify: number of keys out of sort: %ld\n", (long) faults[0] ); + else + passed_verification++; + } +} + + + + +/*****************************************************************/ +/************* R A N K ****************/ +/*****************************************************************/ + + +void rank( int iteration ) { + + INT_TYPE i, k; + +#ifdef USE_BUCKETS + int shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2; + INT_TYPE num_bucket_keys = (1L << shift); +#endif + + key_array[iteration] = iteration; + key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration; + + + /* Determine where the partial verify test keys are, load into */ + /* top of array bucket_size */ + for (i=0; i bucket_ptrs(NUM_BUCKETS*num_procs); + + /* Initialize */ + for (i=0; i> shift]++; + }); + + /* Accumulative bucket sizes are the bucket pointers. + These are global sizes accumulated upon to each bucket */ + bucket_ptrs.local[0] = 0; + for (k=0; k< myid; k++) { + bucket_ptrs.local[0] += bucket_size[k*NUM_BUCKETS]; + } + + for (i=1; i< NUM_BUCKETS; i++) { + bucket_ptrs.local[i] = bucket_ptrs.local[i-1]; + for (k=0; k< myid; k++) + bucket_ptrs.local[i] += bucket_size[k*NUM_BUCKETS+i]; + for (k=myid; k< num_procs; k++) + bucket_ptrs.local[i] += bucket_size[k*NUM_BUCKETS+i-1]; + } + + //////////////////////////////////// + dash::barrier(); + typedef dash::CSRPattern<1> pattern_t; + typedef int index_t; + typedef typename pattern_t::size_type extent_t; + + std::vector local_sizes; + dash::Array my_elem_count(dash::size()); + int local_buckets = ceil((double) NUM_BUCKETS / dash::size()); + + my_elem_count.local[0] = ((myid == num_procs - 1)? NUM_KEYS : bucket_ptrs[(myid + 1)*local_buckets]) - bucket_ptrs[(myid + 0)*local_buckets]; + + dash::barrier(); + + for (int unit_idx = 0; unit_idx < dash::size(); ++unit_idx) { + local_sizes.push_back(my_elem_count[unit_idx]); + } + + pattern_t pattern(local_sizes); + + dash::Array key_buff2l(pattern); + + ////////////////////////////////7 + dash::for_each(key_array.begin(), key_array.end(), [&shift, &bucket_ptrs, &key_buff2l](int k) { + key_buff2l[bucket_ptrs.local[k >> shift]++] = k; + }); + + if ( myid < num_procs-1 ) { + for (i=0; i< NUM_BUCKETS; i++) + for (k=myid+1; k< num_procs; k++) + bucket_ptrs.local[i] += bucket_size[k*NUM_BUCKETS+i]; + } + + + /* Now, buckets are sorted. We only need to sort keys inside + each bucket, which can be done in parallel. Because the distribution + of the number of keys in the buckets is Gaussian, the use of + a dynamic schedule should improve load balance, thus, performance */ + + dash::Array v(NUM_BUCKETS); + + //if ( 0 == dash::myid()) std::iota(v.begin(), v.end(), 0); + + std::iota(v.lbegin(), v.lend(), dash::myid() * local_buckets); //only works when using blocking pattern + + v.barrier(); + + dash::for_each(v.begin(), v.end(), [&num_bucket_keys, &bucket_ptrs, &key_buff2l, &local_buckets, &myid](int i) { + + int key_buff1_offset = local_buckets * myid * num_bucket_keys; + /* Clear the work array section associated with each bucket */ + INT_TYPE j; + INT_TYPE k1 = i * num_bucket_keys - key_buff1_offset; + INT_TYPE k2 = k1 + num_bucket_keys; + for (j = k1; j < k2; j++) + key_buff1.local[j] = 0; + + /* Ranking of all keys occurs in this section: */ + int key_buff2_offset = (myid > 0)? bucket_ptrs.local[myid*local_buckets-1] : 0; + + /* In this section, the keys themselves are used as their + own indexes to determine how many of each there are: their + individual population */ + INT_TYPE m = (i > 0)? bucket_ptrs.local[i-1] : 0; + for (j = m - key_buff2_offset; j < bucket_ptrs.local[i] - key_buff2_offset; j++) + key_buff1.local[key_buff2l.local[j] - key_buff1_offset]++; /* Now they have individual key population */ + + /* To obtain ranks of each key, successively add the individual key + population, not forgetting to add m, the total of lesser keys, + to the first key population */ + key_buff1.local[k1] += m; + for (j = k1+1; j < k2; j++) + key_buff1.local[j] += key_buff1.local[j-1]; + }); + +#else /*USE_BUCKETS*/ + + std::fill(work_buffers.lbegin(), work_buffers.lend(), 0); + + // compute the histogram for the local keys + for (int i=0; i\n" ); + printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); + printf( " Iterations: %d\n", MAX_ITERATIONS ); + printf( " Number of available threads: %d\n", std::thread::hardware_concurrency() ); + printf( "\n" ); + + #ifdef USE_BUCKETS + printf(" This Version of the Benchmark DOES USE Buckets\n\n"); + #else + printf(" This Version of the Benchmark DOES NOT USE Buckets\n\n"); + #endif + + if ( timer_on ) timer_start( 1 ); + } + + alloc_key_buff(); + + /* Generate random number sequence and subsequent keys on all procs */ + create_seq( 314159265.00, /* Random number gen seed */ + 1220703125.00 ); /* Random number gen mult */ + + if ( 0 == dash::myid() ) { + if ( timer_on ) timer_stop( 1 ); + } + + + /* Do one interation for free (i.e., untimed) to guarantee initialization of + all data and code pages and respective tables */ + rank( 1 ); + + /* Start verification counter */ + passed_verification = 0; + + if ( 0 == dash::myid() ) if ( CLASS != 'S' ) printf( "\n iteration\n" ); + + /* Start timer */ + if ( 0 == dash::myid() ) timer_start( 0 ); + + /* This is the main iteration */ + for (iteration=1; iteration<=MAX_ITERATIONS; iteration++) { + + if ( 0 == dash::myid() ) if ( CLASS != 'S' ) printf( " %d\n", iteration ); + rank( iteration ); + dash::barrier(); + } + + if ( 0 == dash::myid() ) { + + /* End of timing, obtain maximum time of all processors */ + timer_stop( 0 ); + + timecounter = timer_read( 0 ); + + + /* This tests that keys are in sequence: sorting of last ranked key seq + occurs here, but is an untimed operation */ + if ( timer_on ) timer_start( 2 ); + } + + full_verify(); + + if ( 0 == dash::myid() ) { + if ( timer_on ) timer_stop( 2 ); + + if ( timer_on ) timer_stop( 3 ); + + /* The final printout */ + if ( passed_verification != 5*MAX_ITERATIONS + 1 ) + passed_verification = 0; + + /*c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) + /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, + CFLAGS, CLINKFLAGS );*/ + c_print_results( (char*)"IS", CLASS, TOTAL_KEYS, 0, 0, MAX_ITERATIONS, nthreads, timecounter, + ((double) (MAX_ITERATIONS*TOTAL_KEYS))/timecounter/1000000.0, (char*)"keys ranked", passed_verification, + (char*)NPBVERSION, (char*)COMPILETIME, (char*)CC, (char*)CLINK, (char*)C_LIB, (char*)C_INC, (char*)CFLAGS, (char*)CLINKFLAGS, (char*)"randlc"); + + /* Print additional timers */ + if ( timer_on ) { + double t_total, t_percent; + + t_total = timer_read( 3 ); + printf("\nAdditional timers -\n"); + printf(" Total execution: %8.3f\n", t_total); + if ( t_total == 0.0 ) t_total = 1.0; + timecounter = timer_read(1); + t_percent = timecounter/t_total * 100.; + printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); + timecounter = timer_read(0); + t_percent = timecounter/t_total * 100.; + printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent); + timecounter = timer_read(2); + t_percent = timecounter/t_total * 100.; + printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); + } + + } + + dash::finalize(); + + return 0; +} +/**************************/ +/* E N D P R O G R A M */ +/**************************/ diff --git a/NPB/MG/Makefile b/NPB/MG/Makefile new file mode 100644 index 0000000..2bcf6b1 --- /dev/null +++ b/NPB/MG/Makefile @@ -0,0 +1,20 @@ +SHELL=/bin/sh +BENCHMARK=mg +BENCHMARKU=MG + +include ../config/make.def + +OBJS = mg.o ${COMMON}/c_print_results.o \ + ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} + ${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS} + +mg.o: mg.cpp npbparams.hpp + ${CCOMPILE} mg.cpp + +clean: + - rm -f *.o *~ + - rm -f npbparams.hpp core diff --git a/NPB/MG/globals.hpp b/NPB/MG/globals.hpp new file mode 100644 index 0000000..f1ea027 --- /dev/null +++ b/NPB/MG/globals.hpp @@ -0,0 +1,45 @@ +/*-------------------------------------------------------------------- +c Parameter lm (declared and set in "npbparams.h") is the log-base2 of +c the edge size max for the partition on a given node, so must be changed +c either to save space (if running a small case) or made bigger for larger +c cases, for example, 512^3. Thus lm=7 means that the largest dimension +c of a partition that can be solved on a node is 2^7 = 128. lm is set +c automatically in npbparams.h +c Parameters ndim1, ndim2, ndim3 are the local problem dimensions. +c-------------------------------------------------------------------*/ + +#include "npbparams.hpp" + +/* parameters */ +/* actual dimension including ghost cells for communications */ +#define NM (2+(2<<(LM-1))) +/* size of rhs array */ +#define NV (2+(2<<(NDIM1-1))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1)))) +/* size of residual array */ +#define NR ((8*(NV+(NM*NM)+5*NM+7*LM))/7) +/* size of communication buffer */ +#define NM2 (2*NM*NM) +/* maximum number of levels */ +#define MAXLEVEL 11 + +#define TIMERS_ENABLED FALSE + +/*---------------------------------------------------------------------*/ +/* common /mg3/ */ +static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1]; +/* common /ClassType/ */ +static char class_npb; +/* common /my_debug/ */ +static int debug_vec[8]; +/* common /fap/ */ +/*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/ +static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1]; +static int lt, lb; + +/*c--------------------------------------------------------------------- +c Set at m=1024, can handle cases up to 1024^3 case +c---------------------------------------------------------------------*/ +#define M 1037 + +/* common /buffer/ */ +/*static double buff[4][NM2];*/ diff --git a/NPB/MG/mg.cpp b/NPB/MG/mg.cpp new file mode 100644 index 0000000..61f7569 --- /dev/null +++ b/NPB/MG/mg.cpp @@ -0,0 +1,1600 @@ +/*-------------------------------------------------------------------- + Information on NAS Parallel Benchmarks is available at: + http://www.nas.nasa.gov/Software/NPB/ + Authors: E. Barszcz + P. Frederickson + A. Woo + M. Yarrow + DASH version: + Nicco Mietzsch + CPP and OpenMP version: + Dalvan Griebler + Júnior Löff +--------------------------------------------------------------------*/ +#include + +#include +#include +#include "npb-CPP.hpp" + +#include "globals.hpp" + +/* parameters */ +#define T_BENCH 1 +#define T_INIT 2 +#define T_STL 3 + +/* global variables */ +/* common /grid/ */ +static int is1, is2, is3, ie1, ie2, ie3; + +/* functions prototypes */ +static void setup(int *n1, int *n2, int *n3, int lt); +static void mg3P(std::vector > &u, dash::NArray &v, std::vector > &r, double a[4], double c[4], int n1, int n2, int n3, int k); +static void psinv(dash::NArray &r, dash::NArray &u, int n1, int n2, int n3, double c[4], int k); +static void resid(dash::NArray &u, dash::NArray &v, dash::NArray &r, int n1, int n2, int n3, double a[4], int k); +static void rprj3(dash::NArray &r, int m1k, int m2k, int m3k, dash::NArray &s, int m1j, int m2j, int m3j, int k); +static void interp(dash::NArray &z, int mm1, int mm2, int mm3, dash::NArray &u, int n1, int n2, int n3, int k); +static void norm2u3(dash::NArray &r, int n1, int n2, int n3, double *rnm2, double *rnmu, int nx, int ny, int nz); +static void rep_nrm(dash::NArray &u, int n1, int n2, int n3, char *title, int kk); +static void comm3(dash::NArray &u, int n1, int n2, int n3); +static void zran3(dash::NArray &z, int n1, int n2, int n3, int nx, int ny, int k); +static void showall(dash::NArray &z, int n1, int n2, int n3); +static double power(double a, int n); +static void bubble(double ten[M][2], int j1[M][2], int j2[M][2], int j3[M][2], int m, int ind); +static void zero3(dash::NArray &z, int n1, int n2, int n3); +/*static void nonzero(double ***z, int n1, int n2, int n3);*/ + +/*-------------------------------------------------------------------- + program mg +c-------------------------------------------------------------------*/ + +int main(int argc, char *argv[]) { + + dash::init(&argc, &argv); + /*------------------------------------------------------------------------- + c k is the current level. It is passed down through subroutine args + c and is NOT global. it is the current iteration + c------------------------------------------------------------------------*/ + + int k, it; + double t, tinit, mflops; + int nthreads = 1; + + /*------------------------------------------------------------------------- + c These arrays are in common because they are quite large + c and probably shouldn't be allocated on the stack. They + c are always passed as subroutine args. + c------------------------------------------------------------------------*/ + + auto distspec = dash::DistributionSpec<3>(dash::BLOCKED, dash::NONE , dash::NONE); + std::vector > u; + dash::NArray v; + std::vector > r; + + double a[4], c[4]; + + double rnm2, rnmu; + double epsilon = 1.0e-8; + int n1, n2, n3, nit; + double verify_value; + boolean verified; + + int i, j, l; + FILE *fp; + + timer_clear(T_BENCH); + timer_clear(T_INIT); + timer_clear(T_STL); + + nthreads = dash::size(); + + timer_start(T_INIT); + + /*---------------------------------------------------------------------- + c Read in and broadcast input data + c---------------------------------------------------------------------*/ + if ( dash::myid() == 0 ) { + printf("\n\n NAS Parallel Benchmarks 4.0 C++ DASH version" " - MG Benchmark\n\n"); + printf("\n\n Developed by: Dalvan Griebler \n"); + printf("\n\n DASH version by: Nicco Mietzsch \n"); + } + + fp = fopen("mg.input", "r"); + if ( fp != NULL ) { + printf(" Reading from input file mg.input\n"); + if ( fscanf(fp, "%d", <) != 1 ){ + printf(" Error in reading elements\n"); + exit(1); + } + while (fgetc(fp) != '\n'); + if ( fscanf(fp, "%d%d%d", &nx[lt], &ny[lt], &nz[lt]) != 3 ){ + printf(" Error in reading elements\n"); + exit(1); + } + while (fgetc(fp) != '\n'); + if ( fscanf(fp, "%d", &nit) != 1 ){ + printf(" Error in reading elements\n"); + exit(1); + } + while (fgetc(fp) != '\n'); + for (i = 0; i <= 7; i++) { + if ( fscanf(fp, "%d", &debug_vec[i]) != 1 ){ + printf(" Error in reading elements\n"); + exit(1); + } + } + fclose(fp); + } else { + if ( dash::myid() == 0 ) printf(" No input file. Using compiled defaults\n"); + + lt = LT_DEFAULT; + nit = NIT_DEFAULT; + nx[lt] = NX_DEFAULT; + ny[lt] = NY_DEFAULT; + nz[lt] = NZ_DEFAULT; + + for (i = 0; i <= 7; i++) { + debug_vec[i] = DEBUG_DEFAULT; + } + } + + if ( (nx[lt] != ny[lt]) || (nx[lt] != nz[lt]) ) { + class_npb = 'U'; + } else if ( nx[lt] == 32 && nit == 4 ) { + class_npb = 'S'; + } else if ( nx[lt] == 64 && nit == 40 ) { + class_npb = 'W'; + } else if ( nx[lt] == 256 && nit == 20 ) { + class_npb = 'B'; + } else if ( nx[lt] == 512 && nit == 20 ) { + class_npb = 'C'; + } else if ( nx[lt] == 256 && nit == 4 ) { + class_npb = 'A'; + } else { + class_npb = 'U'; + } + + /*-------------------------------------------------------------------- + c Use these for debug info: + c--------------------------------------------------------------------- + c debug_vec(0) = 1 !=> report all norms + c debug_vec(1) = 1 !=> some setup information + c debug_vec(1) = 2 !=> more setup information + c debug_vec(2) = k => at level k or below, show result of resid + c debug_vec(3) = k => at level k or below, show result of psinv + c debug_vec(4) = k => at level k or below, show result of rprj + c debug_vec(5) = k => at level k or below, show result of interp + c debug_vec(6) = 1 => (unused) + c debug_vec(7) = 1 => (unused) + c-------------------------------------------------------------------*/ + + a[0] = -8.0/3.0; + a[1] = 0.0; + a[2] = 1.0/6.0; + a[3] = 1.0/12.0; + + if ( class_npb == 'A' || class_npb == 'S' || class_npb =='W' ) { + /*-------------------------------------------------------------------- + c Coefficients for the S(a) smoother + c-------------------------------------------------------------------*/ + c[0] = -3.0/8.0; + c[1] = 1.0/32.0; + c[2] = -1.0/64.0; + c[3] = 0.0; + } else { + /*-------------------------------------------------------------------- + c Coefficients for the S(b) smoother + c-------------------------------------------------------------------*/ + c[0] = -3.0/17.0; + c[1] = 1.0/33.0; + c[2] = -1.0/61.0; + c[3] = 0.0; + } + + lb = 1; + + setup(&n1,&n2,&n3,lt); + + u.resize(lt+1); + for (l = lt; l >=1; l--) { + u[l].allocate(m3[l],m2[l],m1[l], distspec); + } + + v.allocate(m3[lt],m2[lt],m1[lt], distspec); + + r.resize(lt+1); + for (l = lt; l >=1; l--) { + r[l].allocate(m3[l],m2[l],m1[l], distspec); + } + + zero3(u[lt],n1,n2,n3); + zran3(v,n1,n2,n3,nx[lt],ny[lt],lt); + + norm2u3(v,n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); + + if ( dash::myid() == 0 ) { + /*printf("\n norms of random v are\n"); + printf(" %4d%19.12e%19.12e\n", 0, rnm2, rnmu); + printf(" about to evaluate resid, k= %d\n", lt);*/ + + printf(" Size: %3dx%3dx%3d (class_npb %1c)\n", nx[lt], ny[lt], nz[lt], class_npb); + printf(" Iterations: %3d\n", nit); + } + + resid(u[lt],v,r[lt],n1,n2,n3,a,lt); + norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); + + /*c--------------------------------------------------------------------- + c One iteration for startup + c---------------------------------------------------------------------*/ + mg3P(u,v,r,a,c,n1,n2,n3,lt); + resid(u[lt],v,r[lt],n1,n2,n3,a,lt); + + setup(&n1,&n2,&n3,lt); + + zero3(u[lt],n1,n2,n3); + zran3(v,n1,n2,n3,nx[lt],ny[lt],lt); + + if ( dash::myid() == 0 ) { + timer_stop(T_INIT); + + timer_clear(T_STL); + + //std::clear(); + + timer_start(T_BENCH); + } + + resid(u[lt],v,r[lt],n1,n2,n3,a,lt); + norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); + + for (it = 1; it <= nit; it++) { + mg3P(u,v,r,a,c,n1,n2,n3,lt); + resid(u[lt],v,r[lt],n1,n2,n3,a,lt); + } + norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); + + if ( dash::myid() == 0 ) { + timer_stop(T_BENCH); + + t = timer_read(T_BENCH); + tinit = timer_read(T_INIT); + + verified = FALSE; + verify_value = 0.0; + + printf(" Initialization time: %15.3f seconds\n", tinit); + printf(" Benchmark completed\n"); + + if ( class_npb != 'U' ) { + if ( class_npb == 'S' ) { + verify_value = 0.530770700573e-04; + } else if ( class_npb == 'W' ) { + verify_value = 0.250391406439e-17; /* 40 iterations*/ + /* 0.183103168997d-044 iterations*/ + } else if ( class_npb == 'A' ) { + verify_value = 0.2433365309e-5; + } else if ( class_npb == 'B' ) { + verify_value = 0.180056440132e-5; + } else if ( class_npb == 'C' ) { + verify_value = 0.570674826298e-06; + } + + if ( fabs( rnm2 - verify_value ) <= epsilon ) { + verified = TRUE; + printf(" VERIFICATION SUCCESSFUL\n"); + printf(" L2 Norm is %20.12e\n", rnm2); + printf(" Error is %20.12e\n", rnm2 - verify_value); + } else { + verified = FALSE; + printf(" VERIFICATION FAILED\n"); + printf(" L2 Norm is %20.12e\n", rnm2); + printf(" The correct L2 Norm is %20.12e\n", verify_value); + } + } else { + verified = FALSE; + printf(" Problem size unknown\n"); + printf(" NO VERIFICATION PERFORMED\n"); + } + + if ( t != 0.0 ) { + int nn = nx[lt]*ny[lt]*nz[lt]; + mflops = 58.*nit*nn*1.0e-6 / t; + } else { + mflops = 0.0; + } + + c_print_results((char*)"MG", class_npb, nx[lt], ny[lt], nz[lt], nit, nthreads, t, mflops, (char*)" floating point", + verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7); + if ( TIMERS_ENABLED == TRUE ) printf(" time spent in STL: %15.3f seconds\n", timer_read(T_STL)); + + } + + dash::finalize(); + + return 0; +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void setup(int *n1, int *n2, int *n3, int lt) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + int k; + + for (k = lt-1; k >= 1; k--) { + nx[k] = nx[k+1]/2; + ny[k] = ny[k+1]/2; + nz[k] = nz[k+1]/2; + } + + for (k = 1; k <= lt; k++) { + m1[k] = nx[k]+2; + m2[k] = nz[k]+2; + m3[k] = ny[k]+2; + } + + is1 = 1; + ie1 = nx[lt]; + *n1 = nx[lt]+2; + is2 = 1; + ie2 = ny[lt]; + *n2 = ny[lt]+2; + is3 = 1; + ie3 = nz[lt]; + *n3 = nz[lt]+2; + + if ( debug_vec[1] >= 1 && dash::myid() == 0 ) { + printf(" in setup, \n"); + printf(" lt nx ny nz n1 n2 n3 is1 is2 is3 ie1 ie2 ie3\n"); + printf("%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d\n",lt,nx[lt],ny[lt],nz[lt],*n1,*n2,*n3,is1,is2,is3,ie1,ie2,ie3); + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void mg3P(std::vector > &u, dash::NArray &v, std::vector > &r, double a[4], double c[4], int n1, int n2, int n3, int k) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c multigrid V-cycle routine + c-------------------------------------------------------------------*/ + + int j; + + /*-------------------------------------------------------------------- + c down cycle. + c restrict the residual from the find grid to the coarse + c-------------------------------------------------------------------*/ + + for (k = lt; k >= lb+1; k--) { + j = k-1; + rprj3(r[k], m1[k], m2[k], m3[k], r[j], m1[j], m2[j], m3[j], k); + } + + k = lb; + /*-------------------------------------------------------------------- + c compute an approximate solution on the coarsest grid + c-------------------------------------------------------------------*/ + zero3(u[k], m1[k], m2[k], m3[k]); + psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k); + + for (k = lb+1; k <= lt-1; k++) { + j = k-1; + /*-------------------------------------------------------------------- + c prolongate from level k-1 to k + c-------------------------------------------------------------------*/ + zero3(u[k], m1[k], m2[k], m3[k]); + interp(u[j], m1[j], m2[j], m3[j], u[k], m1[k], m2[k], m3[k], k); + /*-------------------------------------------------------------------- + c compute residual for level k + c-------------------------------------------------------------------*/ + resid(u[k], r[k], r[k], m1[k], m2[k], m3[k], a, k); + /*-------------------------------------------------------------------- + c apply smoother + c-------------------------------------------------------------------*/ + psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k); + } + + j = lt - 1; + k = lt; + interp(u[j], m1[j], m2[j], m3[j], u[lt], n1, n2, n3, k); + resid(u[lt], v, r[lt], n1, n2, n3, a, k); + psinv(r[lt], u[lt], n1, n2, n3, c, k); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void psinv(dash::NArray &r, dash::NArray &u, int n1, int n2, int n3, double c[4], int k) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c psinv applies an approximate inverse as smoother: u = u + Cr + c + c This implementation costs 15A + 4M per result, where + c A and M denote the costs of Addition and Multiplication. + c Presuming coefficient c(3) is zero (the NPB assumes this, + c but it is thus not a general case), 2A + 1M may be eliminated, + c resulting in 13A + 3M. + c Note that this vectorizes, and is also fine for cache + c based machines. + c-------------------------------------------------------------------*/ + double r1[n1], r2[n1]; + + int myid = dash::myid(); + + auto pattern = u.pattern(); + auto local_beg_gidx = pattern.coords(pattern.global(0)); + auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1)); + + int topcoord = local_beg_gidx[0]-1; + int bottomcoord = local_end_gidx[0]+1; + int z_ext = u.local.extent(0); + + // double topplane[n2][n1]; + // double bottomplane[n2][n1]; + std::vector topplane(n2*n1); + std::vector bottomplane(n2*n1); + + dash::Future fut_top; + dash::Future fut_bot; + + if ( topcoord >= 0 && z_ext > 0 ) { + fut_top = dash::copy_async(r.begin()+n2*n1*topcoord, r.begin()+n2*n1*(topcoord+1), &topplane[0]); + } + + if ( bottomcoord < n3 && z_ext > 0 ) { + fut_bot = dash::copy_async(r.begin()+n2*n1*bottomcoord, r.begin()+n2*n1*(bottomcoord+1), &bottomplane[0]); + } + + for (int i3 = 1; i3 < z_ext-1; i3++) { + for (int i2 = 1; i2 < n2-1; i2++) { + for (int i1 = 0; i1 < n1; i1++) { + r1[i1] = r.local(i3,i2-1,i1) + r.local(i3,i2+1,i1) + r.local(i3-1,i2,i1) + r.local(i3+1,i2,i1); + r2[i1] = r.local(i3-1,i2-1,i1) + r.local(i3-1,i2+1,i1) + r.local(i3+1,i2-1,i1) + r.local(i3+1,i2+1,i1); + } + for (int i1 = 1; i1 < n1-1; i1++) { + u.local(i3,i2,i1) = u.local(i3,i2,i1) + + c[0] * r.local(i3,i2,i1) + + c[1] * ( r.local(i3,i2,i1-1) + r.local(i3,i2,i1+1) + r1[i1] ) + + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] ); + //-------------------------------------------------------------------- + //c Assume c(3) = 0 (Enable line below if c(3) not= 0) + //c------------------------------------------------------------------- + //c > + c(3) * ( r2(i1-1) + r2(i1+1) ) + //c------------------------------------------------------------------- + } + } + } + + if ( z_ext > 1 ) { + if ( topcoord >= 0 ) { + int i3 = 0; + fut_top.wait(); + + for (int i2 = 1; i2 < n2-1; i2++) { + for (int i1 = 0; i1 < n1; i1++) { + r1[i1] = r.local(i3,i2-1,i1) + r.local(i3,i2+1,i1) + topplane[i2*n2+i1] + r.local(i3+1,i2,i1); + r2[i1] = topplane[(i2-1)*n2+i1] + topplane[(i2+1)*n2+i1] + r.local(i3+1,i2-1,i1) + r.local(i3+1,i2+1,i1); + } + for (int i1 = 1; i1 < n1-1; i1++) { + u.local(i3,i2,i1) = u.local(i3,i2,i1) + + c[0] * r.local(i3,i2,i1) + + c[1] * ( r.local(i3,i2,i1-1) + r.local(i3,i2,i1+1) + r1[i1] ) + + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] ); + //-------------------------------------------------------------------- + //c Assume c(3) = 0 (Enable line below if c(3) not= 0) + //c------------------------------------------------------------------- + //c > + c(3) * ( r2(i1-1) + r2(i1+1) ) + //c------------------------------------------------------------------- + } + } + } + if ( bottomcoord < n3 ) { + int i3 = z_ext-1; + fut_bot.wait(); + + for (int i2 = 1; i2 < n2-1; i2++) { + for (int i1 = 0; i1 < n1; i1++) { + r1[i1] = r.local(i3,i2-1,i1) + r.local(i3,i2+1,i1) + r.local(i3-1,i2,i1) + bottomplane[i2*n2+i1]; + r2[i1] = r.local(i3-1,i2-1,i1) + r.local(i3-1,i2+1,i1) + bottomplane[(i2-1)*n2+i1] + bottomplane[(i2+1)*n2+i1]; + } + for (int i1 = 1; i1 < n1-1; i1++) { + u.local(i3,i2,i1) = u.local(i3,i2,i1) + + c[0] * r.local(i3,i2,i1) + + c[1] * ( r.local(i3,i2,i1-1) + r.local(i3,i2,i1+1) + r1[i1] ) + + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] ); + //-------------------------------------------------------------------- + //c Assume c(3) = 0 (Enable line below if c(3) not= 0) + //c------------------------------------------------------------------- + //c > + c(3) * ( r2(i1-1) + r2(i1+1) ) + //c------------------------------------------------------------------- + } + } + } + } else { + if ( 0 <= topcoord && bottomcoord < n3 ) { + int i3 = 0; + fut_top.wait(); + fut_bot.wait(); + + for (int i2 = 1; i2 < n2-1; i2++) { + for (int i1 = 0; i1 < n1; i1++) { + r1[i1] = r.local(i3,i2-1,i1) + r.local(i3,i2+1,i1) + topplane[i2*n2+i1] + bottomplane[i2*n2+i1]; + r2[i1] = topplane[(i2-1)*n2+i1] + topplane[(i2+1)*n2+i1] + bottomplane[(i2-1)*n2+i1] + bottomplane[(i2+1)*n2+i1]; + } + for (int i1 = 1; i1 < n1-1; i1++) { + u.local(i3,i2,i1) = u.local(i3,i2,i1) + + c[0] * r.local(i3,i2,i1) + + c[1] * ( r.local(i3,i2,i1-1) + r.local(i3,i2,i1+1) + r1[i1] ) + + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] ); + //-------------------------------------------------------------------- + //c Assume c(3) = 0 (Enable line below if c(3) not= 0) + //c------------------------------------------------------------------- + //c > + c(3) * ( r2(i1-1) + r2(i1+1) ) + //c------------------------------------------------------------------- + } + } + } + } + /*-------------------------------------------------------------------- + c exchange boundary points + c-------------------------------------------------------------------*/ + comm3(u,n1,n2,n3); + + if ( debug_vec[0] >= 1 ) { + rep_nrm(u,n1,n2,n3,(char*)" psinv",k); + } + + if ( debug_vec[3] >= k ) { + showall(u,n1,n2,n3); + } + } + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void resid(dash::NArray &u, dash::NArray &v, dash::NArray &r, int n1, int n2, int n3, double a[4], int k) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c resid computes the residual: r = v - Au + c + c This implementation costs 15A + 4M per result, where + c A and M denote the costs of Addition (or Subtraction) and + c Multiplication, respectively. + c Presuming coefficient a(1) is zero (the NPB assumes this, + c but it is thus not a general case), 3A + 1M may be eliminated, + c resulting in 12A + 3M. + c Note that this vectorizes, and is also fine for cache + c based machines. + c-------------------------------------------------------------------*/ + // double u1[n1], u2[n1]; + std::vector u1(n1); + std::vector u2(n1); + + int myid = dash::myid(); + + auto pattern = r.pattern(); + auto local_beg_gidx = pattern.coords(pattern.global(0)); + auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1)); + + int topcoord = local_beg_gidx[0]-1; + int bottomcoord = local_end_gidx[0]+1; + int z_ext = u.local.extent(0); + + std::vector topplane(n2*n1); + std::vector bottomplane(n2*n1); + + dash::Future fut_top; + dash::Future fut_bot; + + if ( topcoord >= 0 && z_ext > 0 ) { + fut_top = dash::copy_async(u.begin()+n2*n1*topcoord, u.begin()+n2*n1*(topcoord+1), &topplane[0]); + } + + if ( bottomcoord < n3 && z_ext > 0 ) { + fut_bot = dash::copy_async(u.begin()+n2*n1*bottomcoord, u.begin()+n2*n1*(bottomcoord+1), &bottomplane[0]); + } + + for (int i3 = 1; i3 < z_ext-1; i3++) { + for (int i2 = 1; i2 < n2-1; i2++) { + for (int i1 = 0; i1 < n1; i1++) { + u1[i1] = u.local(i3,i2-1,i1) + u.local(i3,i2+1,i1) + u.local(i3-1,i2,i1) + u.local(i3+1,i2,i1); + u2[i1] = u.local(i3-1,i2-1,i1) + u.local(i3-1,i2+1,i1) + u.local(i3+1,i2-1,i1) + u.local(i3+1,i2+1,i1); + } + for (int i1 = 1; i1 < n1-1; i1++) { + r.local(i3,i2,i1) = v.local(i3,i2,i1) + - a[0] * u.local(i3,i2,i1) + //-------------------------------------------------------------------- + //c Assume a(1) = 0 (Enable 2 lines below if a(1) not= 0) + //c------------------------------------------------------------------- + //c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3) + //c > + u1(i1) ) + //c------------------------------------------------------------------- + - a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] ) + - a[3] * ( u2[i1-1] + u2[i1+1] ); + } + } + } + + if ( z_ext > 1 ) { + if ( topcoord >= 0 ) { + int i3 = 0; + fut_top.wait(); + + for (int i2 = 1; i2 < n2-1; i2++) { + for (int i1 = 0; i1 < n1; i1++) { + u1[i1] = u.local(i3,i2-1,i1) + u.local(i3,i2+1,i1) + topplane[i2*n2+i1] + u.local(i3+1,i2,i1); + u2[i1] = topplane[(i2-1)*n2+i1] + topplane[(i2+1)*n2+i1] + u.local(i3+1,i2-1,i1) + u.local(i3+1,i2+1,i1); + } + for (int i1 = 1; i1 < n1-1; i1++) { + r.local(i3,i2,i1) = v.local(i3,i2,i1) + - a[0] * u.local(i3,i2,i1) + //-------------------------------------------------------------------- + //c Assume a(1) = 0 (Enable 2 lines below if a(1) not= 0) + //c------------------------------------------------------------------- + //c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3) + //c > + u1(i1) ) + //c------------------------------------------------------------------- + - a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] ) + - a[3] * ( u2[i1-1] + u2[i1+1] ); + } + } + } + if ( bottomcoord < n3 ) { + int i3 = z_ext-1; + fut_bot.wait(); + + for (int i2 = 1; i2 < n2-1; i2++) { + for (int i1 = 0; i1 < n1; i1++) { + u1[i1] = u.local(i3,i2-1,i1) + u.local(i3,i2+1,i1) + u.local(i3-1,i2,i1) + bottomplane[i2*n2+i1]; + u2[i1] = u.local(i3-1,i2-1,i1) + u.local(i3-1,i2+1,i1) + bottomplane[(i2-1)*n2+i1] + bottomplane[(i2+1)*n2+i1]; + } + for (int i1 = 1; i1 < n1-1; i1++) { + r.local(i3,i2,i1) = v.local(i3,i2,i1) + - a[0] * u.local(i3,i2,i1) + //-------------------------------------------------------------------- + //c Assume a(1) = 0 (Enable 2 lines below if a(1) not= 0) + //c------------------------------------------------------------------- + //c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3) + //c > + u1(i1) ) + //c------------------------------------------------------------------- + - a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] ) + - a[3] * ( u2[i1-1] + u2[i1+1] ); + } + } + } + } else { + if ( 0 <= topcoord && bottomcoord < n3 ) { + int i3 = 0; + fut_top.wait(); + fut_bot.wait(); + + for (int i2 = 1; i2 < n2-1; i2++) { + for (int i1 = 0; i1 < n1; i1++) { + u1[i1] = u.local(i3,i2-1,i1) + u.local(i3,i2+1,i1) + topplane[i2*n2+i1] + bottomplane[i2*n2+i1]; + u2[i1] = topplane[(i2-1)*n2+i1] + topplane[(i2+1)*n2+i1] + bottomplane[(i2-1)*n2+i1] + bottomplane[(i2+1)*n2+i1]; + } + for (int i1 = 1; i1 < n1-1; i1++) { + r.local(i3,i2,i1) = v.local(i3,i2,i1) + - a[0] * u.local(i3,i2,i1) + //-------------------------------------------------------------------- + //c Assume a(1) = 0 (Enable 2 lines below if a(1) not= 0) + //c------------------------------------------------------------------- + //c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3) + //c > + u1(i1) ) + //c------------------------------------------------------------------- + - a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] ) + - a[3] * ( u2[i1-1] + u2[i1+1] ); + } + } + } + } + + /*-------------------------------------------------------------------- + c exchange boundary data + c--------------------------------------------------------------------*/ + comm3(r,n1,n2,n3); + + if ( debug_vec[0] >= 1 ) { + rep_nrm(r,n1,n2,n3,(char*)" resid",k); + } + + if ( debug_vec[2] >= k ) { + showall(r,n1,n2,n3); + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void rprj3(dash::NArray &r, int m1k, int m2k, int m3k, dash::NArray &s, int m1j, int m2j, int m3j, int k) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c rprj3 projects onto the next coarser grid, + c using a trilinear Finite Element projection: s = r' = P r + c + c This implementation costs 20A + 4M per result, where + c A and M denote the costs of Addition and Multiplication. + c Note that this vectorizes, and is also fine for cache + c based machines. + c-------------------------------------------------------------------*/ + + int d1, d2, d3; + + if ( m1k == 3 ) { + d1 = 2; + } else { + d1 = 1; + } + + if ( m2k == 3 ) { + d2 = 2; + } else { + d2 = 1; + } + + if ( m3k == 3 ) { + d3 = 2; + } else { + d3 = 1; + } + + int j2, j1, i3, i2, i1; + double x1[m1k], y1[m1k], x2, y2; + + auto s_pattern = s.pattern(); + auto s_local_beg_gidx = s_pattern.coords(s_pattern.global(0)); + auto s_local_end_gidx = s_pattern.coords(s_pattern.global(s_pattern.local_size()-1)); + + auto r_pattern = r.pattern(); + auto r_local_beg_gidx = r_pattern.coords(r_pattern.global(0)); + auto r_local_end_gidx = r_pattern.coords(r_pattern.global(r_pattern.local_size()-1)); + int r_offset = r_local_beg_gidx[0]; + int r_x = (int) r.extent(2); + int r_y = (int) r.extent(1); + + int start = 0; + if ( s_local_beg_gidx[0] == 0 ) start++; + + int end = s.local.extent(0); + if ( s_local_end_gidx[0] == s.extent(0)-1 ) end--; + + int r_planes = 2*(end-start)+1; + + std::vector > r_local_v(r_planes); + + for (int i = 0; i < r_planes; i++) { + r_local_v[i] = std::vector (r_y*r_x); + } + + std::vector r_local(r_planes); + std::vector is_local(r_planes); + int r_idx = 0; + int r_psize = r_y*r_x; + + dash::Future futs[r_planes]; + + for (int j3l = start; j3l < end; j3l++){ + int j3 = s_local_beg_gidx[0]+j3l; + i3 = 2*j3-d3; + if ( r(i3,0,0).is_local() ) { + r_local[r_idx] = r.lbegin()+r_psize*(i3-r_offset); + is_local[r_idx] = true; + } else { + futs[r_idx] = dash::copy_async(r.begin()+r_psize*i3, r.begin()+r_psize*(i3+1), r_local_v[r_idx].data()); + r_local[r_idx] = r_local_v[r_idx].data(); + } + r_idx++; + if ( r(i3+1,0,0).is_local() ) { + r_local[r_idx] = r.lbegin()+r_psize*(i3+1-r_offset); + is_local[r_idx] = true; + } else { + futs[r_idx] = dash::copy_async(r.begin()+r_psize*(i3+1), r.begin()+r_psize*(i3+2), r_local_v[r_idx].data()); + r_local[r_idx] = r_local_v[r_idx].data(); + } + r_idx++; + } + + + if ( start < end ) { + i3 = 2*(s_local_beg_gidx[0]+end)-d3; + if ( r(i3,0,0).is_local() ) { + r_local[r_idx] = r.lbegin()+r_psize*(i3-r_offset); + is_local[r_idx] = true; + } else { + futs[r_idx] = dash::copy_async(r.begin()+r_psize*i3, r.begin()+r_psize*(i3+1), r_local_v[r_idx].data()); + r_local[r_idx] = r_local_v[r_idx].data(); + } + } + + //First, do the local calculations... + r_idx = 0; + + for (int j3l = start; j3l < end; j3l++) { + if ( is_local[r_idx+0] && is_local[r_idx+1] && is_local[r_idx+2] ) { + int j3 = s_local_beg_gidx[0]+j3l; + i3 = 2*j3-d3; + //C i3 = 2*j3-1 + + for (j2 = 1; j2 < m2j-1; j2++) { + i2 = 2*j2-d2; + //C i2 = 2*j2-1 + + for (j1 = 1; j1 < m1j; j1++) { + i1 = 2*j1-d1; + //C i1 = 2*j1-1 + x1[i1] = r_local[r_idx+1][i2*r_x+i1] + r_local[r_idx+1][(i2+2)*r_x+i1] + r_local[r_idx+0][(i2+1)*r_x+i1] + r_local[r_idx+2][(i2+1)*r_x+i1]; + y1[i1] = r_local[r_idx+0][i2*r_x+i1] + r_local[r_idx+2][i2*r_x+i1] + r_local[r_idx+0][(i2+2)*r_x+i1] + r_local[r_idx+2][(i2+2)*r_x+i1]; + } + + for (j1 = 1; j1 < m1j-1; j1++) { + i1 = 2*j1-d1; + //C i1 = 2*j1-1 + y2 = r_local[r_idx+0][i2*r_x+i1+1] + r_local[r_idx+2][i2*r_x+i1+1] + r_local[r_idx+0][(i2+2)*r_x+i1+1] + r_local[r_idx+2][(i2+2)*r_x+i1+1]; + x2 = r_local[r_idx+1][i2*r_x+i1+1] + r_local[r_idx+1][(i2+2)*r_x+i1+1] + r_local[r_idx+0][(i2+1)*r_x+i1+1] + r_local[r_idx+2][(i2+1)*r_x+i1+1]; + s.local(j3l,j2,j1) = + 0.5 * r_local[r_idx+1][(i2+1)*r_x+i1+1] + + 0.25 * ( r_local[r_idx+1][(i2+1)*r_x+i1] + r_local[r_idx+1][(i2+1)*r_x+i1+2] + x2) + + 0.125 * ( x1[i1] + x1[i1+2] + y2) + + 0.0625 * ( y1[i1] + y1[i1+2] ); + } + } + } + r_idx = r_idx + 2; + } + + //...then do the non-local calculations. + r_idx = 0; + + for (int j3l = start; j3l < end; j3l++) { + if ( !(is_local[r_idx+0] && is_local[r_idx+1] && is_local[r_idx+2]) ) { + int j3 = s_local_beg_gidx[0]+j3l; + i3 = 2*j3-d3; + //C i3 = 2*j3-1 + + if(!is_local[r_idx+0]) futs[r_idx+0].wait(); + if(!is_local[r_idx+1]) futs[r_idx+1].wait(); + if(!is_local[r_idx+2]) futs[r_idx+2].wait(); + + for (j2 = 1; j2 < m2j-1; j2++) { + i2 = 2*j2-d2; + //C i2 = 2*j2-1 + + for (j1 = 1; j1 < m1j; j1++) { + i1 = 2*j1-d1; + //C i1 = 2*j1-1 + x1[i1] = r_local[r_idx+1][i2*r_x+i1] + r_local[r_idx+1][(i2+2)*r_x+i1] + r_local[r_idx+0][(i2+1)*r_x+i1] + r_local[r_idx+2][(i2+1)*r_x+i1]; + y1[i1] = r_local[r_idx+0][i2*r_x+i1] + r_local[r_idx+2][i2*r_x+i1] + r_local[r_idx+0][(i2+2)*r_x+i1] + r_local[r_idx+2][(i2+2)*r_x+i1]; + } + + for (j1 = 1; j1 < m1j-1; j1++) { + i1 = 2*j1-d1; + //C i1 = 2*j1-1 + y2 = r_local[r_idx+0][i2*r_x+i1+1] + r_local[r_idx+2][i2*r_x+i1+1] + r_local[r_idx+0][(i2+2)*r_x+i1+1] + r_local[r_idx+2][(i2+2)*r_x+i1+1]; + x2 = r_local[r_idx+1][i2*r_x+i1+1] + r_local[r_idx+1][(i2+2)*r_x+i1+1] + r_local[r_idx+0][(i2+1)*r_x+i1+1] + r_local[r_idx+2][(i2+1)*r_x+i1+1]; + s.local(j3l,j2,j1) = + 0.5 * r_local[r_idx+1][(i2+1)*r_x+i1+1] + + 0.25 * ( r_local[r_idx+1][(i2+1)*r_x+i1] + r_local[r_idx+1][(i2+1)*r_x+i1+2] + x2) + + 0.125 * ( x1[i1] + x1[i1+2] + y2) + + 0.0625 * ( y1[i1] + y1[i1+2] ); + } + } + } + r_idx = r_idx + 2; + } + + comm3(s,m1j,m2j,m3j); + + if ( debug_vec[0] >= 1 ) { + rep_nrm(s,m1j,m2j,m3j,(char*)" rprj3",k-1); + } + + if ( debug_vec[4] >= k ) { + showall(s,m1j,m2j,m3j); + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void interp(dash::NArray &z, int mm1, int mm2, int mm3, dash::NArray &u, int n1, int n2, int n3, int k) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c interp adds the trilinear interpolation of the correction + c from the coarser grid to the current approximation: u = u + Qu' + c + c Observe that this implementation costs 16A + 4M, where + c A and M denote the costs of Addition and Multiplication. + c Note that this vectorizes, and is also fine for cache + c based machines. Vector machines may get slightly better + c performance however, with 8 separate "do i1" loops, rather than 4. + c-------------------------------------------------------------------*/ + + int d1, d2, d3, t1, t2, t3; + + /* + c note that m = 1037 in globals.h but for this only need to be + c 535 to handle up to 1024^3 + c integer m + c parameter( m=535 ) + */ + + if ( n1 != 3 && n2 != 3 && n3 != 3 ) { + + double z1[mm1], z2[mm1], z3[mm1]; + + auto pattern = u.pattern(); + auto local_beg_gidx = pattern.coords(pattern.global(0)); + auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1)); + + auto z_pattern = z.pattern(); + auto z_local_beg_gidx = z_pattern.coords(z_pattern.global(0)); + auto z_local_end_gidx = z_pattern.coords(z_pattern.global(z_pattern.local_size()-1)); + int z_offset = z_local_beg_gidx[0]; + + int u_offset = local_beg_gidx[0]; + + int start = 0 - u_offset%2; + int u_planes = u.local.extent(0); + int z_size = mm2*mm1; + + if ( (int) u_planes > 0 ) { + int z_local_size = u_planes-start+((u_planes+start)%2); + + std::vector > z_local_v(z_local_size); + for (int i = 0; i < z_local_size; i++) { + z_local_v[i] = std::vector (mm2*mm1); + } + std::vector z_local(z_local_size); + std::vector is_local(z_local_size); + + dash::Future futs[z_local_size]; + + for (int j3 = start; j3 < u_planes; j3+=2) { + int i3 = (u_offset + j3)/2; + if ( z(i3,0,0).is_local() ) { + z_local[j3-start] = z.lbegin()+z_size*(i3-z_offset); + is_local[j3-start] = true; + } else { + futs[j3-start] = dash::copy_async(z.begin()+z_size*i3, z.begin()+z_size*(i3+1), z_local_v[j3-start].data()); + z_local[j3-start] = z_local_v[j3-start].data(); + } + if ( z(i3+1,0,0).is_local() ) { + z_local[j3-start+1] = z.lbegin()+z_size*(i3+1-z_offset); + is_local[j3-start+1] = true; + } else { + futs[j3-start+1] = dash::copy_async(z.begin()+z_size*(i3+1), z.begin()+z_size*(i3+2), z_local_v[j3-start+1].data()); + z_local[j3-start+1] = z_local_v[j3-start+1].data(); + } + } + + //First, do the local calculations... + for (int j3 = start; j3 < u_planes; j3+=2) { + // int i3 = (u_offset + j3)/2; + if ( is_local[j3-start] && is_local[j3-start+1] ) { + + for (int i2 = 0; i2 < mm2-1; i2++) { + for (int i1 = 0; i1 < mm1; i1++) { + z1[i1] = z_local[j3-start][(i2+1)*mm2+i1] + z_local[j3-start][i2*mm2+i1]; + z2[i1] = z_local[j3-start+1][i2*mm2+i1] + z_local[j3-start][i2*mm2+i1]; + z3[i1] = z_local[j3-start+1][(i2+1)*mm2+i1] + z_local[j3-start+1][i2*mm2+i1] + z1[i1]; + } + if ( j3 >= 0 ) { + for (int i1 = 0; i1 < mm1-1; i1++) { + u.local(j3,2*i2,2*i1) = u.local(j3,2*i2,2*i1) + z_local[j3-start][i2*mm2+i1]; + u.local(j3,2*i2,2*i1+1) = u.local(j3,2*i2,2*i1+1) + 0.5*(z_local[j3-start][i2*mm2+i1+1]+z_local[j3-start][i2*mm2+i1]); + u.local(j3,2*i2+1,2*i1) = u.local(j3,2*i2+1,2*i1) + 0.5 * z1[i1]; + u.local(j3,2*i2+1,2*i1+1) = u.local(j3,2*i2+1,2*i1+1) + 0.25*( z1[i1] + z1[i1+1] ); + } + } + if ( j3+1 < u_planes ) { + for (int i1 = 0; i1 < mm1-1; i1++) { + u.local(j3+1,2*i2,2*i1) = u.local(j3+1,2*i2,2*i1) + 0.5 * z2[i1]; + u.local(j3+1,2*i2,2*i1+1) = u.local(j3+1,2*i2,2*i1+1) + 0.25*( z2[i1] + z2[i1+1] ); + u.local(j3+1,2*i2+1,2*i1) = u.local(j3+1,2*i2+1,2*i1) + 0.25* z3[i1]; + u.local(j3+1,2*i2+1,2*i1+1) = u.local(j3+1,2*i2+1,2*i1+1) + 0.125*( z3[i1] + z3[i1+1] ); + } + } + } + } + } + + //...then do the non-local calculations. + for (int j3 = start; j3 < u_planes; j3+=2) { + if ( !(is_local[j3-start] && is_local[j3-start+1]) ) { + // int i3 = (u_offset + j3)/2; + if(!is_local[j3-start+0]) futs[j3-start].wait(); + if(!is_local[j3-start+1]) futs[j3-start+1].wait(); + + for (int i2 = 0; i2 < mm2-1; i2++) { + for (int i1 = 0; i1 < mm1; i1++) { + z1[i1] = z_local[j3-start][(i2+1)*mm2+i1] + z_local[j3-start][i2*mm2+i1]; + z2[i1] = z_local[j3-start+1][i2*mm2+i1] + z_local[j3-start][i2*mm2+i1]; + z3[i1] = z_local[j3-start+1][(i2+1)*mm2+i1] + z_local[j3-start+1][i2*mm2+i1] + z1[i1]; + } + if ( j3 >= 0 ) { + for (int i1 = 0; i1 < mm1-1; i1++) { + u.local(j3,2*i2,2*i1) = u.local(j3,2*i2,2*i1) + z_local[j3-start][i2*mm2+i1]; + u.local(j3,2*i2,2*i1+1) = u.local(j3,2*i2,2*i1+1) + 0.5*(z_local[j3-start][i2*mm2+i1+1]+z_local[j3-start][i2*mm2+i1]); + u.local(j3,2*i2+1,2*i1) = u.local(j3,2*i2+1,2*i1) + 0.5 * z1[i1]; + u.local(j3,2*i2+1,2*i1+1) = u.local(j3,2*i2+1,2*i1+1) + 0.25*( z1[i1] + z1[i1+1] ); + } + } + if ( j3+1 < u_planes ) { + for (int i1 = 0; i1 < mm1-1; i1++) { + u.local(j3+1,2*i2,2*i1) = u.local(j3+1,2*i2,2*i1) + 0.5 * z2[i1]; + u.local(j3+1,2*i2,2*i1+1) = u.local(j3+1,2*i2,2*i1+1) + 0.25*( z2[i1] + z2[i1+1] ); + u.local(j3+1,2*i2+1,2*i1) = u.local(j3+1,2*i2+1,2*i1) + 0.25* z3[i1]; + u.local(j3+1,2*i2+1,2*i1+1) = u.local(j3+1,2*i2+1,2*i1+1) + 0.125*( z3[i1] + z3[i1+1] ); + } + } + } + } + } + } + + } else { //this gets never called for our benchmark sizes, so we didn't parallelize it. + + if ( n1 == 3 ) { + d1 = 2; + t1 = 1; + } else { + d1 = 1; + t1 = 0; + } + if ( n2 == 3 ) { + d2 = 2; + t2 = 1; + } else { + d2 = 1; + t2 = 0; + } + if ( n3 == 3 ) { + d3 = 2; + t3 = 1; + } else { + d3 = 1; + t3 = 0; + } + if ( 0 == dash::myid() ) { + + for (int i3 = d3; i3 <= mm3-1; i3++) { + for (int i2 = d2; i2 <= mm2-1; i2++) { + for (int i1 = d1; i1 <= mm1-1; i1++) { + u[2*i3-d3-1][2*i2-d2-1][2*i1-d1-1] = + u[2*i3-d3-1][2*i2-d2-1][2*i1-d1-1] + +z[i3-1][i2-1][i1-1]; + } + for (int i1 = 1; i1 <= mm1-1; i1++) { + u[2*i3-d3-1][2*i2-d2-1][2*i1-t1-1] = + u[2*i3-d3-1][2*i2-d2-1][2*i1-t1-1] + +0.5*(z[i3-1][i2-1][i1]+z[i3-1][i2-1][i1-1]); + } + } + for (int i2 = 1; i2 <= mm2-1; i2++) { + for (int i1 = d1; i1 <= mm1-1; i1++) { + u[2*i3-d3-1][2*i2-t2-1][2*i1-d1-1] = + u[2*i3-d3-1][2*i2-t2-1][2*i1-d1-1] + +0.5*(z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]); + } + for (int i1 = 1; i1 <= mm1-1; i1++) { + u[2*i3-d3-1][2*i2-t2-1][2*i1-t1-1] = + u[2*i3-d3-1][2*i2-t2-1][2*i1-t1-1] + +0.25*(z[i3-1][i2][i1]+z[i3-1][i2-1][i1] + z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]); + } + } + } + + for (int i3 = 1; i3 <= mm3-1; i3++) { + for (int i2 = d2; i2 <= mm2-1; i2++) { + for (int i1 = d1; i1 <= mm1-1; i1++) { + u[2*i3-t3-1][2*i2-d2-1][2*i1-d1-1] = + u[2*i3-t3-1][2*i2-d2-1][2*i1-d1-1] + +0.5*(z[i3][i2-1][i1-1]+z[i3-1][i2-1][i1-1]); + } + for (int i1 = 1; i1 <= mm1-1; i1++) { + u[2*i3-t3-1][2*i2-d2-1][2*i1-t1-1] = + u[2*i3-t3-1][2*i2-d2-1][2*i1-t1-1] + +0.25*(z[i3][i2-1][i1]+z[i3][i2-1][i1-1] + z[i3-1][i2-1][i1]+z[i3-1][i2-1][i1-1]); + } + } + for (int i2 = 1; i2 <= mm2-1; i2++) { + for (int i1 = d1; i1 <= mm1-1; i1++) { + u[2*i3-t3-1][2*i2-t2-1][2*i1-d1-1] = + u[2*i3-t3-1][2*i2-t2-1][2*i1-d1-1] + +0.25*(z[i3][i2][i1-1]+z[i3][i2-1][i1-1] + z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]); + } + for (int i1 = 1; i1 <= mm1-1; i1++) { + u[2*i3-t3-1][2*i2-t2-1][2*i1-t1-1] = + u[2*i3-t3-1][2*i2-t2-1][2*i1-t1-1] + +0.125*(z[i3][i2][i1]+z[i3][i2-1][i1] + +z[i3][i2][i1-1]+z[i3][i2-1][i1-1] + +z[i3-1][i2][i1]+z[i3-1][i2-1][i1] + +z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]); + } + } + } + + + { + if ( debug_vec[0] >= 1 ) { + rep_nrm(z,mm1,mm2,mm3,(char*)"z: inter",k-1); + rep_nrm(u,n1,n2,n3,(char*)"u: inter",k); + } + if ( debug_vec[5] >= k ) { + showall(z,mm1,mm2,mm3); + showall(u,n1,n2,n3); + } + } + } + } + dash::barrier(); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void norm2u3(dash::NArray &r, int n1, int n2, int n3, double *rnm2, double *rnmu, int nx, int ny, int nz) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c norm2u3 evaluates approximations to the L2 norm and the + c uniform (or L-infinity or Chebyshev) norm, under the + c assumption that the boundaries are periodic or zero. Add the + c boundaries in with half weight (quarter weight on the edges + c and eighth weight at the corners) for inhomogeneous boundaries. + c-------------------------------------------------------------------*/ + + double s, a; + double tmp; + int i3, i2, i1, n; + dash::Array p_s(dash::size()); + dash::Array p_a(dash::size()); + + auto pattern = r.pattern(); + auto local_beg_gidx = pattern.coords(pattern.global(0)); + auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1)); + + int start = 0; + if ( local_beg_gidx[0] == 0 ) start++; + + int end = r.local.extent(0); + if ( local_end_gidx[0] == n3-1 ) end--; + + n = nx*ny*nz; + + for (i3 = start; i3 < end; i3++) { + for (i2 = 1; i2 < n2-1; i2++) { + for (i1 = 1; i1 < n1-1; i1++) { + p_s.local[0] = p_s.local[0] + r.local(i3,i2,i1) * r.local(i3,i2,i1); + tmp = fabs(r.local(i3,i2,i1)); + if ( tmp > p_a.local[0] ) p_a.local[0] = tmp; + } + } + } + + dash::barrier(); + s = dash::reduce(p_s.begin(), p_s.end(), 0.0, std::plus()); + a = (double) (*dash::max_element(p_a.begin(), p_a.end())); + if ( a > *rnmu ) *rnmu = a; + + *rnm2 = sqrt(s/(double)n); + +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void rep_nrm(dash::NArray &u, int n1, int n2, int n3, char *title, int kk) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c report on norm + c-------------------------------------------------------------------*/ + + double rnm2, rnmu; + norm2u3(u,n1,n2,n3,&rnm2,&rnmu,nx[kk],ny[kk],nz[kk]); + if ( dash::myid() == 0 ) printf(" Level%2d in %8s: norms =%21.14e%21.14e\n", kk, title, rnm2, rnmu); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void comm3(dash::NArray &u, int n1, int n2, int n3) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c comm3 organizes the communication on all borders + c-------------------------------------------------------------------*/ + auto pattern = u.pattern(); + auto local_beg_gidx = pattern.coords(pattern.global(0)); + auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1)); + + int start = 0; + if ( local_beg_gidx[0] == 0 ) start++; + + int end = u.local.extent(0); + if ( local_end_gidx[0] == n3-1 ) end--; + + // axis = 1 + + for (int i3 = start; i3 < end; i3++) { + for (int i2 = 1; i2 < n2-1; i2++) { + u.local(i3,i2,n1-1) = u.local(i3,i2,1); + u.local(i3,i2,0) = u.local(i3,i2,n1-2); + } + } + + // axis = 2 + + for (int i3 = start; i3 < end; i3++) { + for (int i1 = 0; i1 < n1; i1++) { + u.local(i3,n2-1,i1) = u.local(i3,1,i1); + u.local(i3,0,i1) = u.local(i3,n2-2,i1); + } + } + + dash::barrier(); + // axis = 3 + if ( u(u.extent(0)-1,0,0).is_local() ) { + dash::copy(u.begin()+n2*n1*1, u.begin()+n2*n1*2, u.lbegin()+n2*n1*end); + } + + if ( u(0,0,0).is_local() ) { + dash::copy(u.begin()+n2*n1*(n3-2), u.begin()+n2*n1*(n3-1), u.lbegin()); + } + + dash::barrier(); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void zran3(dash::NArray &z, int n1, int n2, int n3, int nx, int ny, int k) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c zran3 loads +1 at ten randomly chosen points, + c loads -1 at a different ten random points, + c and zero elsewhere. + c-------------------------------------------------------------------*/ + + #define MM 10 + #define A pow(5.0,13) + #define X 314159265.e0 + + int i0, m0, m1; + /*int i1, i2, i3, d1, e1, e2, e3;*/ + int i1, i2, i3, d1, e2, e3; + double xx, x0, x1, a1, a2, ai; + + double ten[MM][2], best; + int i, j1[MM][2], j2[MM][2], j3[MM][2]; + + + /*double rdummy;*/ + + a1 = power( A, nx ); + a2 = power( A, nx*ny ); + + zero3(z,n1,n2,n3); + + i = is1-1+nx*(is2-1+ny*(is3-1)); + + ai = power( A, i ); + d1 = ie1 - is1 + 1; + /*e1 = ie1 - is1 + 2;*/ + e2 = ie2 - is2 + 2; + e3 = ie3 - is3 + 2; + x0 = X; + /*rdummy = */randlc( &x0, ai ); + auto pattern = z.pattern(); + auto local_beg_gidx = pattern.coords(pattern.global(0)); + auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1)); + + int start = 0; + if ( local_beg_gidx[0] == 0 ) start++; + + int end = z.local.extent(0); + if ( local_end_gidx[0] == z.extent(0)-1 ) end--; + + for (int j = 0; j < (int) local_beg_gidx[0]-1; j++){ + randlc( &x0, a2 ); + } + + for (i3 = start; i3 < end; i3++) { + x1 = x0; + for (i2 = 1; i2 < e2; i2++) { + xx = x1; + double tmp[d1+1]; + vranlc( d1, &xx, A, tmp); + for (int i = 1; i <= d1; i++) z.local(i3,i2,i) = tmp[i]; + // vranlc( d1, &xx, A, (double *) &(z[i3][i2][0])); + /*rdummy = */randlc( &x1, a1 ); + } + /*rdummy = */randlc( &x0, a2 ); + } + + dash::barrier(); + if ( dash::myid() == 0 ) { + + /*-------------------------------------------------------------------- + c call comm3(z,n1,n2,n3) + c call showall(z,n1,n2,n3) + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c each processor looks for twenty candidates + c-------------------------------------------------------------------*/ + + for (i = 0; i < MM; i++) { + ten[i][1] = 0.0; + j1[i][1] = 0; + j2[i][1] = 0; + j3[i][1] = 0; + ten[i][0] = 1.0; + j1[i][0] = 0; + j2[i][0] = 0; + j3[i][0] = 0; + } + double current; + for (i3 = 1; i3 < n3-1; i3++) { + for (i2 = 1; i2 < n2-1; i2++) { + for (i1 = 1; i1 < n1-1; i1++) { + current = z(i3,i2,i1); + if ( current > ten[0][1] ) { + ten[0][1] = current; + j1[0][1] = i1; + j2[0][1] = i2; + j3[0][1] = i3; + bubble( ten, j1, j2, j3, MM, 1 ); + } + if ( current < ten[0][0] ) { + ten[0][0] = current; + j1[0][0] = i1; + j2[0][0] = i2; + j3[0][0] = i3; + bubble( ten, j1, j2, j3, MM, 0 ); + } + } + } + } + /*-------------------------------------------------------------------- + c Now which of these are globally best? + c-------------------------------------------------------------------*/ + // i1 = MM - 1; + // i0 = MM - 1; + // int jg[4][MM][2]; + // for (i = MM - 1 ; i >= 0; i--) { + // best = z[j3[i1][1]][j2[i1][1]][j1[i1][1]]; + // if (best == z[j3[i1][1]][j2[i1][1]][j1[i1][1]]) { + // jg[0][i][1] = 0; + // jg[1][i][1] = is1 - 1 + j1[i1][1]; + // jg[2][i][1] = is2 - 1 + j2[i1][1]; + // jg[3][i][1] = is3 - 1 + j3[i1][1]; + // i1 = i1-1; + // } else { + // jg[0][i][1] = 0; + // jg[1][i][1] = 0; + // jg[2][i][1] = 0; + // jg[3][i][1] = 0; + // } + // ten[i][1] = best; + // best = z[j3[i0][0]][j2[i0][0]][j1[i0][0]]; + // if (best == z[j3[i0][0]][j2[i0][0]][j1[i0][0]]) { + // jg[0][i][0] = 0; + // jg[1][i][0] = is1 - 1 + j1[i0][0]; + // jg[2][i][0] = is2 - 1 + j2[i0][0]; + // jg[3][i][0] = is3 - 1 + j3[i0][0]; + // i0 = i0-1; + // } else { + // jg[0][i][0] = 0; + // jg[1][i][0] = 0; + // jg[2][i][0] = 0; + // jg[3][i][0] = 0; + // } + // ten[i][0] = best; + // } + // m1 = i1+1; + // m0 = i0+1; + + /* printf(" negative charges at"); + for (i = 0; i < MM; i++) { + if (i%5 == 0) printf("\n"); + printf(" (%3d,%3d,%3d)", jg[1][i][0], jg[2][i][0], jg[3][i][0]); + } + printf("\n positive charges at"); + for (i = 0; i < MM; i++) { + if (i%5 == 0) printf("\n"); + printf(" (%3d,%3d,%3d)", jg[1][i][1], jg[2][i][1], jg[3][i][1]); + } + printf("\n small random numbers were\n"); + for (i = MM-1; i >= 0; i--) { + printf(" %15.8e", ten[i][0]); + } + printf("\n and they were found on processor number\n"); + for (i = MM-1; i >= 0; i--) { + printf(" %4d", jg[0][i][0]); + } + printf("\n large random numbers were\n"); + for (i = MM-1; i >= 0; i--) { + printf(" %15.8e", ten[i][1]); + } + printf("\n and they were found on processor number\n"); + for (i = MM-1; i >= 0; i--) { + printf(" %4d", jg[0][i][1]); + } + printf("\n");*/ +} + dash::barrier(); + zero3(z, n1, n2, n3); + dash::barrier(); + if ( dash::myid() == 0 ) { + + for (i = MM-1; i >= m0; i--) { + z[j3[i][0]][j2[i][0]][j1[i][0]] = -1.0; + } + for (i = MM-1; i >= m1; i--) { + z[j3[i][1]][j2[i][1]][j1[i][1]] = 1.0; + } + } + dash::barrier(); + comm3(z,n1,n2,n3); + + /*-------------------------------------------------------------------- + c call showall(z,n1,n2,n3) + c-------------------------------------------------------------------*/ + +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void showall(dash::NArray &z, int n1, int n2, int n3) { + if ( dash::myid() == 0 ) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + int i1,i2,i3; + int m1, m2, m3; + + m1 = min(n1,18); + m2 = min(n2,14); + m3 = min(n3,18); + + printf("\n"); + for (i3 = 0; i3 < m3; i3++) { + for (i1 = 0; i1 < m1; i1++) { + for (i2 = 0; i2 < m2; i2++) { + printf("%6.3f", (double) z[i3][i2][i1]); + } + printf("\n"); + } + printf(" - - - - - - - \n"); + } + printf("\n"); + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static double power(double a, int n) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c power raises an integer, disguised as a double + c precision real, to an integer power + c-------------------------------------------------------------------*/ + double aj; + int nj; + /* double rdummy;*/ + double power; + + power = 1.0; + nj = n; + aj = a; + + while (nj != 0) { + if ( (nj%2) == 1 ) /*rdummy = */randlc( &power, aj ); + /*rdummy = */randlc( &aj, aj ); + nj = nj/2; + } + + return (power); +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void bubble(double ten[M][2], int j1[M][2], int j2[M][2], int j3[M][2], int m, int ind) { + + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + + /*-------------------------------------------------------------------- + c bubble does a bubble sort in direction dir + c-------------------------------------------------------------------*/ + double temp; + int i, j_temp; + if ( ind == 1 ) { + for (i = 0; i < m-1; i++) { + if ( ten[i][ind] > ten[i+1][ind] ) { + temp = ten[i+1][ind]; + ten[i+1][ind] = ten[i][ind]; + ten[i][ind] = temp; + + j_temp = j1[i+1][ind]; + j1[i+1][ind] = j1[i][ind]; + j1[i][ind] = j_temp; + + j_temp = j2[i+1][ind]; + j2[i+1][ind] = j2[i][ind]; + j2[i][ind] = j_temp; + + j_temp = j3[i+1][ind]; + j3[i+1][ind] = j3[i][ind]; + j3[i][ind] = j_temp; + } else { + return; + } + } + } else { + for (i = 0; i < m-1; i++) { + if ( ten[i][ind] < ten[i+1][ind] ) { + + temp = ten[i+1][ind]; + ten[i+1][ind] = ten[i][ind]; + ten[i][ind] = temp; + + j_temp = j1[i+1][ind]; + j1[i+1][ind] = j1[i][ind]; + j1[i][ind] = j_temp; + + j_temp = j2[i+1][ind]; + j2[i+1][ind] = j2[i][ind]; + j2[i][ind] = j_temp; + + j_temp = j3[i+1][ind]; + j3[i+1][ind] = j3[i][ind]; + j3[i][ind] = j_temp; + } else { + return; + } + } + } +} + +/*-------------------------------------------------------------------- +c-------------------------------------------------------------------*/ + +static void zero3(dash::NArray &z, int n1, int n2, int n3) { + /*-------------------------------------------------------------------- + c-------------------------------------------------------------------*/ + dash::fill(z.begin(), z.end(), 0.0); +} + +/*---- end of program ------------------------------------------------*/ diff --git a/NPB/Makefile b/NPB/Makefile new file mode 100644 index 0000000..19ee235 --- /dev/null +++ b/NPB/Makefile @@ -0,0 +1,71 @@ +SHELL=/bin/sh +CLASS=W +SFILE=config/suite.def + +default: header + @ $(SHELL) sys/print_instructions + +BT: bt +bt: header + cd BT; $(MAKE) CLASS=$(CLASS) + +SP: sp +sp: header + cd SP; $(MAKE) CLASS=$(CLASS) + +LU: lu +lu: header + cd LU; $(MAKE) CLASS=$(CLASS) + +MG: mg +mg: header + cd MG; $(MAKE) CLASS=$(CLASS) + +FT: ft +ft: header + cd FT; $(MAKE) CLASS=$(CLASS) + +IS: is +is: header + cd IS; $(MAKE) CLASS=$(CLASS) + +CG: cg +cg: header + cd CG; $(MAKE) CLASS=$(CLASS) + +EP: ep +ep: header + cd EP; $(MAKE) CLASS=$(CLASS) +DC: dc +dc: header + cd DC; $(MAKE) CLASS=$(CLASS) + +# Awk script courtesy cmg@cray.com +suite: + @ awk '{ if ($$1 !~ /^#/ && NF > 0) \ + printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE) \ + | $(SHELL) + + +# It would be nice to make clean in each subdirectory (the targets +# are defined) but on a really clean system this will won't work +# because those makefiles need config/make.def +clean: + - rm -f core + - rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe + - rm -f sys/setparams sys/makesuite sys/setparams.hpp + +cleanall: clean + - rm -r bin/* + +veryclean: clean + - rm config/make.def config/suite.def Part* + - rm bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* bin/ep.* bin/cg.* + +header: + @ $(SHELL) sys/print_header + +kit: + - makekit -s100k -k30 * */* */*/* + + diff --git a/NPB/README.md b/NPB/README.md new file mode 100644 index 0000000..9dc45f4 --- /dev/null +++ b/NPB/README.md @@ -0,0 +1,79 @@ +To use the C++NPB, adjust the config/make.def and rename the .cpp's. The makefile only accepts ep.cpp, mg.cpp, cg.cpp, is.cpp and ft.cpp. + +Below is the original makefile for Griebler et al. + +# How to Cite our Work + +D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. + +# The NPB-CPP Benchmark + +These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version. + + ================================================================== + NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB + + Code contributors: + Dalvan Griebler + Júnior Löff + + Warning: in case of problems send an email to us: + dalvan.griebler@acad.pucrs.br + junior.loff@acad.pucrs.br + ================================================================== + + +This folder contains: + + - NPB-FF - Directory with the parallel version implemented in FastFlow + - NPB-OMP - Directory with the parallel version translated from the original NPB version + - NPB-SER - Directory with the serial version of the NPB ported to C++ + - NPB-TBB - Directory with the parallel version implemented in Thread Building Blocks + +Each directory is independent and contains its own implemented version of the kernels: + + IS - Integer Sort, random memory access + EP - Embarrassingly Parallel + CG - Conjugate Gradient, irregular memory access and communication + MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive + FT - discrete 3D fast Fourier Transform, all-to-all communication + +# Software Requiriments + +*Warning: our tests were made with GCC-5* + +**TBB** + +*Installation* + + apt-get install libtbb-dev + +**FastFlow** + +*Installation* + + svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow + + +# How to Compile + +Enter the directory from the version desired and execute: + + make _BENCHMARK CLASS=_VERSION + + +_BENCHMARKs are: + + EP, CG, MG, IS and FT + +_VERSIONs are: + + Class S: small for quick test purposes + Class W: workstation size (a 90's workstation; now likely too small) + Classes A, B, C: standard test problems; ~4X size increase going from one class to the next + Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes + + +Command: + + make ep CLASS=B \ No newline at end of file diff --git a/NPB/common/c_print_results.cpp b/NPB/common/c_print_results.cpp new file mode 100755 index 0000000..9d4d21c --- /dev/null +++ b/NPB/common/c_print_results.cpp @@ -0,0 +1,70 @@ +/*****************************************************************/ +/****** C _ P R I N T _ R E S U L T S ******/ +/*****************************************************************/ +#include +#include + +void c_print_results( char *name, char class_npb, int n1, int n2, int n3, int niter, int nthreads, double t, + double mops, char *optype, int passed_verification, char *npbversion, char *compiletime, char *cc, + char *clink, char *c_lib, char *c_inc, char *cflags, char *clinkflags, char *rand) +{ + + printf( "\n\n %s Benchmark Completed\n", name ); + + printf( " class_npb = %c\n", class_npb ); + + if( n2 == 0 && n3 == 0 ) + printf( " Size = %12d\n", n1 ); /* as in IS */ + else + printf( " Size = %3dx%3dx%3d\n", n1,n2,n3 ); + + printf( " Iterations = %12d\n", niter ); + + printf( " Threads = %12d\n", nthreads ); + + printf( " Time in seconds = %12.2f\n", t ); + + printf( " Mop/s total = %12.2f\n", mops ); + + printf( " Operation type = %24s\n", optype); + + if( passed_verification ) + printf( " Verification = SUCCESSFUL\n" ); + else + printf( " Verification = UNSUCCESSFUL\n" ); + + printf( " Version = %12s\n", npbversion ); + + printf( " Compile date = %12s\n", compiletime ); + + printf( "\n Compile options:\n" ); + + printf( " CC = %s\n", cc ); + + printf( " CLINK = %s\n", clink ); + + printf( " C_LIB = %s\n", c_lib ); + + printf( " C_INC = %s\n", c_inc ); + + printf( " CFLAGS = %s\n", cflags ); + + printf( " CLINKFLAGS = %s\n", clinkflags ); + + printf( " RAND = %s\n", rand ); +#ifdef SMP + char *evalue = getenv("MP_SET_NUMTHREADS"); + printf( " MULTICPUS = %s\n", evalue ); +#endif + +/* printf( "\n\n" ); + printf( " Please send the results of this run to:\n\n" ); + printf( " NPB Development Team\n" ); + printf( " Internet: npb@nas.nasa.gov\n \n" ); + printf( " If email is not available, send this to:\n\n" ); + printf( " MS T27A-1\n" ); + printf( " NASA Ames Research Center\n" ); + printf( " Moffett Field, CA 94035-1000\n\n" ); + printf( " Fax: 415-604-3957\n\n" );*/ +} + diff --git a/NPB/common/c_randdp.cpp b/NPB/common/c_randdp.cpp new file mode 100755 index 0000000..521e562 --- /dev/null +++ b/NPB/common/c_randdp.cpp @@ -0,0 +1,137 @@ +/* +*/ +#if defined(USE_POW) +#define r23 pow(0.5, 23.0) +#define r46 (r23*r23) +#define t23 pow(2.0, 23.0) +#define t46 (t23*t23) +#else +#define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5) +#define r46 (r23*r23) +#define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0) +#define t46 (t23*t23) +#endif + +/*c--------------------------------------------------------------------- +c---------------------------------------------------------------------*/ + +double randlc (double *x, double a) { + +/*c--------------------------------------------------------------------- +c---------------------------------------------------------------------*/ + +/*c--------------------------------------------------------------------- +c +c This routine returns a uniform pseudorandom double precision number in the +c range (0, 1) by using the linear congruential generator +c +c x_{k+1} = a x_k (mod 2^46) +c +c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +c before repeating. The argument A is the same as 'a' in the above formula, +c and X is the same as x_0. A and X must be odd double precision integers +c in the range (1, 2^46). The returned value RANDLC is normalized to be +c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain +c the new seed x_1, so that subsequent calls to RANDLC using the same +c arguments will generate a continuous sequence. +c +c This routine should produce the same results on any computer with at least +c 48 mantissa bits in double precision floating point data. On 64 bit +c systems, double precision should be disabled. +c +c David H. Bailey October 26, 1990 +c +c---------------------------------------------------------------------*/ + + double t1,t2,t3,t4,a1,a2,x1,x2,z; + +/*c--------------------------------------------------------------------- +c Break A into two parts such that A = 2^23 * A1 + A2. +c---------------------------------------------------------------------*/ + t1 = r23 * a; + a1 = (int)t1; + a2 = a - t23 * a1; + +/*c--------------------------------------------------------------------- +c Break X into two parts such that X = 2^23 * X1 + X2, compute +c Z = A1 * X2 + A2 * X1 (mod 2^23), and then +c X = 2^23 * Z + A2 * X2 (mod 2^46). +c---------------------------------------------------------------------*/ + t1 = r23 * (*x); + x1 = (int)t1; + x2 = (*x) - t23 * x1; + t1 = a1 * x2 + a2 * x1; + t2 = (int)(r23 * t1); + z = t1 - t23 * t2; + t3 = t23 * z + a2 * x2; + t4 = (int)(r46 * t3); + (*x) = t3 - t46 * t4; + + return (r46 * (*x)); +} + +/*c--------------------------------------------------------------------- +c---------------------------------------------------------------------*/ + +void vranlc (int n, double *x_seed, double a, double y[]) { + +/*c--------------------------------------------------------------------- +c---------------------------------------------------------------------*/ + +/*c--------------------------------------------------------------------- +c +c This routine generates N uniform pseudorandom double precision numbers in +c the range (0, 1) by using the linear congruential generator +c +c x_{k+1} = a x_k (mod 2^46) +c +c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +c before repeating. The argument A is the same as 'a' in the above formula, +c and X is the same as x_0. A and X must be odd double precision integers +c in the range (1, 2^46). The N results are placed in Y and are normalized +c to be between 0 and 1. X is updated to contain the new seed, so that +c subsequent calls to VRANLC using the same arguments will generate a +c continuous sequence. If N is zero, only initialization is performed, and +c the variables X, A and Y are ignored. +c +c This routine is the standard version designed for scalar or RISC systems. +c However, it should produce the same results on any single processor +c computer with at least 48 mantissa bits in double precision floating point +c data. On 64 bit systems, double precision should be disabled. +c +c---------------------------------------------------------------------*/ + + int i; + double x,t1,t2,t3,t4,a1,a2,x1,x2,z; + +/*c--------------------------------------------------------------------- +c Break A into two parts such that A = 2^23 * A1 + A2. +c---------------------------------------------------------------------*/ + t1 = r23 * a; + a1 = (int)t1; + a2 = a - t23 * a1; + x = *x_seed; + +/*c--------------------------------------------------------------------- +c Generate N results. This loop is not vectorizable. +c---------------------------------------------------------------------*/ + for (i = 1; i <= n; i++) { + +/*c--------------------------------------------------------------------- +c Break X into two parts such that X = 2^23 * X1 + X2, compute +c Z = A1 * X2 + A2 * X1 (mod 2^23), and then +c X = 2^23 * Z + A2 * X2 (mod 2^46). +c---------------------------------------------------------------------*/ + t1 = r23 * x; + x1 = (int)t1; + x2 = x - t23 * x1; + t1 = a1 * x2 + a2 * x1; + t2 = (int)(r23 * t1); + z = t1 - t23 * t2; + t3 = t23 * z + a2 * x2; + t4 = (int)(r46 * t3); + x = t3 - t46 * t4; + y[i] = r46 * x; + } + *x_seed = x; +} \ No newline at end of file diff --git a/NPB/common/c_timers.cpp b/NPB/common/c_timers.cpp new file mode 100755 index 0000000..f38f92c --- /dev/null +++ b/NPB/common/c_timers.cpp @@ -0,0 +1,65 @@ + + + +#include "wtime.hpp" +#include + +/* Prototype */ +void wtime( double * ); + + + +/*****************************************************************/ +/****** E L A P S E D _ T I M E ******/ +/*****************************************************************/ +double elapsed_time( void ) +{ + double t; + + wtime( &t ); + return( t ); +} + + +double start[64], elapsed[64]; + +/*****************************************************************/ +/****** T I M E R _ C L E A R ******/ +/*****************************************************************/ +void timer_clear( int n ) +{ + elapsed[n] = 0.0; +} + + +/*****************************************************************/ +/****** T I M E R _ S T A R T ******/ +/*****************************************************************/ +void timer_start( int n ) +{ + start[n] = elapsed_time(); +} + + +/*****************************************************************/ +/****** T I M E R _ S T O P ******/ +/*****************************************************************/ +void timer_stop( int n ) +{ + double t, now; + + now = elapsed_time(); + t = now - start[n]; + elapsed[n] += t; + +} + + +/*****************************************************************/ +/****** T I M E R _ R E A D ******/ +/*****************************************************************/ +double timer_read( int n ) +{ + return( elapsed[n] ); +} + diff --git a/NPB/common/npb-CPP.hpp b/NPB/common/npb-CPP.hpp new file mode 100755 index 0000000..07181e9 --- /dev/null +++ b/NPB/common/npb-CPP.hpp @@ -0,0 +1,52 @@ +#include +#include +#include +#if defined(_OPENMP) +#include +#endif /* _OPENMP */ + +typedef int boolean; +typedef struct { double real; double imag; } dcomplex; + +#define TRUE 1 +#define FALSE 0 + +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#define pow2(a) ((a)*(a)) + +#define get_real(c) c.real +#define get_imag(c) c.imag +#define cadd(c,a,b) (c.real = a.real + b.real, c.imag = a.imag + b.imag) +#define csub(c,a,b) (c.real = a.real - b.real, c.imag = a.imag - b.imag) +#define cmul(c,a,b) (c.real = a.real * b.real - a.imag * b.imag, \ + c.imag = a.real * b.imag + a.imag * b.real) +#define crmul(c,a,b) (c.real = a.real * b, c.imag = a.imag * b) + +dcomplex cmulret(dcomplex a, dcomplex b) { + dcomplex res; + res.real = a.real * b.real - a.imag * b.imag; + res.imag = a.real * b.imag + a.imag * b.real; + return res; +} + +dcomplex crmulret(dcomplex a, double b) { + dcomplex res; + res.real = a.real * b; + res.imag = a.imag * b; + return res; +} + +extern double randlc(double *, double); +extern void vranlc(int, double *, double, double *); +extern void timer_clear(int); +extern void timer_start(int); +extern void timer_stop(int); +extern double timer_read(int); + +extern void c_print_results(char *name, char class_npb, int n1, int n2, + int n3, int niter, int nthreads, double t, + double mops, char *optype, int passed_verification, + char *npbversion, char *compiletime, char *cc, + char *clink, char *c_lib, char *c_inc, + char *cflags, char *clinkflags, char *rand); diff --git a/NPB/common/util.h b/NPB/common/util.h new file mode 100755 index 0000000..b47c1b4 --- /dev/null +++ b/NPB/common/util.h @@ -0,0 +1,16 @@ + +#ifndef UTIL_H_INCLUDED +#define UTIL_H_INCLUDED + +#include + +#define TIMESTAMP(time_) \ + do { \ + struct timespec ts; \ + clock_gettime(CLOCK_MONOTONIC, &ts); \ + time_=((double)ts.tv_sec)+(1.0e-9)*((double)ts.tv_nsec); \ + } while(0) + + +#endif /* UTIL_H_INCLUDED */ + diff --git a/NPB/common/wtime.cpp b/NPB/common/wtime.cpp new file mode 100755 index 0000000..33f2fe6 --- /dev/null +++ b/NPB/common/wtime.cpp @@ -0,0 +1,13 @@ +#include "wtime.hpp" +#include + +void wtime(double *t) +{ + static int sec = -1; + struct timeval tv; + gettimeofday(&tv, 0); + if (sec < 0) sec = tv.tv_sec; + *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; +} + + diff --git a/NPB/common/wtime.hpp b/NPB/common/wtime.hpp new file mode 100755 index 0000000..12eb0cb --- /dev/null +++ b/NPB/common/wtime.hpp @@ -0,0 +1,12 @@ +/* C/Fortran interface is different on different machines. + * You may need to tweak this. + */ + + +#if defined(IBM) +#define wtime wtime +#elif defined(CRAY) +#define wtime WTIME +#else +#define wtime wtime_ +#endif diff --git a/NPB/common/wtime_sgi64.cpp b/NPB/common/wtime_sgi64.cpp new file mode 100755 index 0000000..6c810d2 --- /dev/null +++ b/NPB/common/wtime_sgi64.cpp @@ -0,0 +1,74 @@ +#include +#include +#include +#include +#include +#include +#include + +/* The following works on SGI Power Challenge systems */ + +typedef unsigned long iotimer_t; + +unsigned int cycleval; +volatile iotimer_t *iotimer_addr, base_counter; +double resolution; + +/* address_t is an integer type big enough to hold an address */ +typedef unsigned long address_t; + + + +void timer_init() +{ + + int fd; + char *virt_addr; + address_t phys_addr, page_offset, pagemask, pagebase_addr; + + pagemask = getpagesize() - 1; + errno = 0; + phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval); + if (errno != 0) { + perror("SGI_QUERY_CYCLECNTR"); + exit(1); + } + /* rel_addr = page offset of physical address */ + page_offset = phys_addr & pagemask; + pagebase_addr = phys_addr - page_offset; + fd = open("/dev/mmem", O_RDONLY); + + virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr); + virt_addr = virt_addr + page_offset; + iotimer_addr = (iotimer_t *)virt_addr; + /* cycleval in picoseconds to this gives resolution in seconds */ + resolution = 1.0e-12*cycleval; + base_counter = *iotimer_addr; +} + +void wtime_(double *time) +{ + static int initialized = 0; + volatile iotimer_t counter_value; + if (!initialized) { + timer_init(); + initialized = 1; + } + counter_value = *iotimer_addr - base_counter; + *time = (double)counter_value * resolution; +} + + +void wtime(double *time) +{ + static int initialized = 0; + volatile iotimer_t counter_value; + if (!initialized) { + timer_init(); + initialized = 1; + } + counter_value = *iotimer_addr - base_counter; + *time = (double)counter_value * resolution; +} + + diff --git a/NPB/config/make.def b/NPB/config/make.def new file mode 100755 index 0000000..63cd602 --- /dev/null +++ b/NPB/config/make.def @@ -0,0 +1,101 @@ +#--------------------------------------------------------------------------- +# +# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. +# +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# Items in this file will need to be changed for each platform. +# (Note these definitions are inconsistent with NPB2.1.) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# Parallel C: +# +# CC - C compiler +# CFLAGS - C compilation arguments +# C_INC - any -I arguments required for compiling C +# CLINK - C linker +# CLINKFLAGS - C linker flags +# C_LIB - any -L and -l arguments required for linking C +# +# compilations are done with $(CC) $(C_INC) $(CFLAGS) or +# $(CC) $(CFLAGS) +# linking is done with $(CLINK) $(C_LIB) $(CLINKFLAGS) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# This is the C compiler used for DASH programs +#--------------------------------------------------------------------------- +CC = dash-mpicxx +#gcc #cc +# This links C programs; usually the same as ${CC} +CLINK = $(CC) + +#--------------------------------------------------------------------------- +# These macros are passed to the linker +#--------------------------------------------------------------------------- +C_LIB = -L$(HOME)/opt/dash-0.4.0/lib + +#--------------------------------------------------------------------------- +# These macros are passed to the compiler +#--------------------------------------------------------------------------- +C_INC = -I../common -I$(HOME)/opt/dash-0.4.0/include + +#--------------------------------------------------------------------------- +# Global *compile time* flags for C programs (use -w3 -diag-disable 981 -diag-disable 383) +#--------------------------------------------------------------------------- +CFLAGS = -std=c++14 -O3 + +#--------------------------------------------------------------------------- +# Global *link time* flags. Flags for increasing maximum executable +# size usually go here. (use -w3 -diag-disable 981 -diag-disable 383) +#--------------------------------------------------------------------------- +CLINKFLAGS = -lm -ldash-mpi -ldart-mpi -ldart-base -lhwloc -lnuma -lpthread + + +#--------------------------------------------------------------------------- +# Utilities C: +# +# This is the C compiler used to compile C utilities. Flags required by +# this compiler go here also; typically there are few flags required; hence +# there are no separate macros provided for such flags. +#--------------------------------------------------------------------------- +UCC = gcc +#UCC = cc + + +#--------------------------------------------------------------------------- +# Destination of executables, relative to subdirs of the main directory. . +#--------------------------------------------------------------------------- +BINDIR = ../bin + + +#--------------------------------------------------------------------------- +# The variable RAND controls which random number generator +# is used. It is described in detail in Doc/README.install. +# Use "randi8" unless there is a reason to use another one. +# Other allowed values are "randi8_safe", "randdp" and "randdpvec" +#--------------------------------------------------------------------------- +# RAND = randi8 +# The following is highly reliable but may be slow: +RAND = randdp + + +#--------------------------------------------------------------------------- +# The variable WTIME is the name of the wtime source code module in the +# NPB2.x/common directory. +# For most machines, use wtime.c +# For SGI power challenge: use wtime_sgi64.c +#--------------------------------------------------------------------------- +WTIME = wtime.cpp + + +#--------------------------------------------------------------------------- +# Enable if either Cray or IBM: +# (no such flag for most machines: see common/wtime.h) +# This is used by the C compiler to pass the machine name to common/wtime.h, +# where the C/Fortran binding interface format is determined +#--------------------------------------------------------------------------- +# MACHINE = -DCRAY +# MACHINE = -DIBM diff --git a/NPB/config/suite.def b/NPB/config/suite.def new file mode 100755 index 0000000..6b9be9d --- /dev/null +++ b/NPB/config/suite.def @@ -0,0 +1,14 @@ +# config/suite.def +# This file is used to build several benchmarks with a single command. +# Typing "make suite" in the main directory will build all the benchmarks +# specified in this file. +# Each line of this file contains a benchmark name, class, and number +# of nodes. The name is one of "cg", "is", "ep", mg", "ft" +# The class is one of "S", "W", "A", "B", and "C". +# No blank lines. +# The following example builds serial sample sizes of all benchmarks. +ft B +mg B +is B +ep B +cg B diff --git a/NPB/sys/Makefile b/NPB/sys/Makefile new file mode 100644 index 0000000..829fc85 --- /dev/null +++ b/NPB/sys/Makefile @@ -0,0 +1,21 @@ +include ../config/make.def + +# Note that COMPILE is also defined in make.common and should +# be the same. We can't include make.common because it has a lot +# of other garbage. +FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) + +all: setparams + +# setparams creates an npbparam.h file for each benchmark +# configuration. npbparams.h also contains info about how a benchmark +# was compiled and linked + +setparams: setparams.cpp ../config/make.def + $(UCC) -o setparams setparams.cpp + + +clean: + -rm -f setparams setparams.hpp npbparams.hpp + -rm -f *~ *.o + diff --git a/NPB/sys/README b/NPB/sys/README new file mode 100644 index 0000000..ede69b5 --- /dev/null +++ b/NPB/sys/README @@ -0,0 +1,41 @@ +This directory contains utilities and files used by the +build process. You should not need to change anything +in this directory. + +Original Files +-------------- +setparams.c: + Source for the setparams program. This program is used internally + in the build process to create the file "npbparams.h" for each + benchmark. npbparams.h contains Fortran or C parameters to build a + benchmark for a specific class. The setparams program is never run + directly by a user. Its invocation syntax is + + "setparams benchmark-name class". + + It examines the file "npbparams.h" in the current directory. If + the specified parameters are the same as those in the npbparams.h + file, nothing it changed. If the file does not exist or corresponds + to a different class/number of nodes, it is (re)built. + One of the more complicated things in npbparams.h is that it + contains, in a Fortran string, the compiler flags used to build a + benchmark, so that a benchmark can print out how it was compiled. + +make.common + A makefile segment that is included in each individual benchmark + program makefile. It sets up some standard macros (COMPILE, etc) + and makes sure everything is configured correctly (npbparams.h) + +Makefile + Builds setparams + +README + This file. + + +Created files +------------- + +setparams + See descriptions above + diff --git a/NPB/sys/make.common b/NPB/sys/make.common new file mode 100644 index 0000000..abb3a28 --- /dev/null +++ b/NPB/sys/make.common @@ -0,0 +1,65 @@ +PROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS) +FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) +CCOMPILE = $(CC) -c $(C_INC) $(CFLAGS) + +# Class "U" is used internally by the setparams program to mean +# "unknown". This means that if you don't specify CLASS= +# on the command line, you'll get an error. It would be nice +# to be able to avoid this, but we'd have to get information +# from the setparams back to the make program, which isn't easy. +CLASS=U + +default:: ${PROGRAM} + +# This makes sure the configuration utility setparams +# is up to date. +# Note that this must be run every time, which is why the +# target does not exist and is not created. +# If you create a file called "config" you will break things. +config: + @cd ../sys; ${MAKE} all + ../sys/setparams ${BENCHMARK} ${CLASS} + +COMMON=../common +${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f + cd ${COMMON}; ${FCOMPILE} ${RAND}.f + +${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp + cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp + +${COMMON}/print_results.o: ${COMMON}/print_results.f + cd ${COMMON}; ${FCOMPILE} print_results.f + +${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp + cd ${COMMON}; ${CCOMPILE} c_print_results.cpp + +${COMMON}/timers.o: ${COMMON}/timers.f + cd ${COMMON}; ${FCOMPILE} timers.f + +${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp + cd ${COMMON}; ${CCOMPILE} c_timers.cpp + +${COMMON}/wtime.o: ${COMMON}/${WTIME} + cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME} +# For most machines or CRAY or IBM +# cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c +# For a precise timer on an SGI Power Challenge, try: +# cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c + +${COMMON}/c_wtime.o: ${COMMON}/${WTIME} + cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME} + + +# Normally setparams updates npbparams.h only if the settings (CLASS) +# have changed. However, we also want to update if the compile options +# may have changed (set in ../config/make.def). +npbparams.hpp: ../config/make.def + @ echo make.def modified. Rebuilding npbparams.hpp just in case + rm -f npbparams.hpp + ../sys/setparams ${BENCHMARK} ${CLASS} + +# So that "make benchmark-name" works +${BENCHMARK}: default +${BENCHMARKU}: default + + diff --git a/NPB/sys/print_header b/NPB/sys/print_header new file mode 100644 index 0000000..7765ebd --- /dev/null +++ b/NPB/sys/print_header @@ -0,0 +1,15 @@ +echo '' +echo ' =========================================' +echo ' = NAS Parallel Benchmarks =' +echo ' = C++ DASH Versions =' +echo ' = Developed by: Dalvan Griebler =' +echo ' = Júnior Löff =' +echo ' = DASH version by: Nicco Mietzsch =' +echo ' = =' +echo ' = Warning: in case of problems =' +echo ' = send an email to us: =' +echo ' = dalvan.griebler@acad.pucrs.br =' +echo ' = junior.loff@acad.pucrs.br =' +echo ' = nicco.mietzsch@campus.lmu.de =' +echo ' =========================================' +echo '' diff --git a/NPB/sys/print_instructions b/NPB/sys/print_instructions new file mode 100644 index 0000000..bc6221f --- /dev/null +++ b/NPB/sys/print_instructions @@ -0,0 +1,18 @@ +echo '' +echo ' To make a NAS benchmark type ' +echo '' +echo ' make CLASS=' +echo '' +echo ' where is "cg", "ep", "ft", "is", or "mg"' +echo ' is "S", "W", "A", "B" or "C"' +echo '' +echo ' To make a set of benchmarks, create the file config/suite.def' +echo ' according to the instructions in config/suite.def.template and type' +echo '' +echo ' make suite' +echo '' +echo ' ***************************************************************' +echo ' * Remember to edit the file config/make.def for site specific *' +echo ' * information as described in the README file *' +echo ' ***************************************************************' + diff --git a/NPB/sys/setparams.cpp b/NPB/sys/setparams.cpp new file mode 100644 index 0000000..107ca57 --- /dev/null +++ b/NPB/sys/setparams.cpp @@ -0,0 +1,896 @@ +/* + * This utility configures a NPB to be built for a specific class. + * It creates a file "npbparams.h" + * in the source directory. This file keeps state information about + * which size of benchmark is currently being built (so that nothing + * if unnecessarily rebuilt) and defines (through PARAMETER statements) + * the number of nodes and class for which a benchmark is being built. + + * The utility takes 3 arguments: + * setparams benchmark-name class + * benchmark-name is "sp", "bt", etc + * class is the size of the benchmark + * These parameters are checked for the current benchmark. If they + * are invalid, this program prints a message and aborts. + * If the parameters are ok, the current npbsize.h (actually just + * the first line) is read in. If the new parameters are the same as + * the old, nothing is done, but an exit code is returned to force the + * user to specify (otherwise the make procedure succeeds but builds a + * binary of the wrong name). Otherwise the file is rewritten. + * Errors write a message (to stdout) and abort. + * + * This program makes use of two extra benchmark "classes" + * class "X" means an invalid specification. It is returned if + * there is an error parsing the config file. + * class "U" is an external specification meaning "unknown class" + * + * Unfortunately everything has to be case sensitive. This is + * because we can always convert lower to upper or v.v. but + * can't feed this information back to the makefile, so typing + * make CLASS=a and make CLASS=A will produce different binaries. + * + * + */ + +#include +#include +#include +#include +#include +#include + +/* + * This is the master version number for this set of + * NPB benchmarks. It is in an obscure place so people + * won't accidentally change it. + */ + +#define VERSION "4.0" + +/* controls verbose output from setparams */ +/* #define VERBOSE */ + +#define FILENAME "npbparams.hpp" +#define DESC_LINE "/* CLASS = %c */\n" +#define DEF_CLASS_LINE "#define CLASS '%c'\n" +#define FINDENT " " +#define CONTINUE " > " + +void get_info(char *argv[], int *typep, char *classp); +void check_info(int type, char class_npb); +void read_info(int type, char *classp); +void write_info(int type, char class_npb); +void write_sp_info(FILE *fp, char class_npb); +void write_bt_info(FILE *fp, char class_npb); +void write_dc_info(FILE *fp, char class_npb); +void write_lu_info(FILE *fp, char class_npb); +void write_mg_info(FILE *fp, char class_npb); +void write_cg_info(FILE *fp, char class_npb); +void write_ft_info(FILE *fp, char class_npb); +void write_ep_info(FILE *fp, char class_npb); +void write_is_info(FILE *fp, char class_npb); +void write_compiler_info(int type, FILE *fp); +void write_convertdouble_info(int type, FILE *fp); +void check_line(char *line, char *label, char *val); +int check_include_line(char *line, char *filename); +void put_string(FILE *fp, char *name, char *val); +void put_def_string(FILE *fp, char *name, char *val); +void put_def_variable(FILE *fp, char *name, char *val); +int ilog2(int i); + +enum benchmark_types {SP, BT, LU, MG, FT, IS, EP, CG, DC}; + +main(int argc, char *argv[]) +{ + int type; + char class_npb, class_old; + + if (argc != 3) { + printf("Usage: %s benchmark-name class_npb\n", argv[0]); + exit(1); + } + + /* Get command line arguments. Make sure they're ok. */ + get_info(argv, &type, &class_npb); + if (class_npb != 'U') { +#ifdef VERBOSE + printf("setparams: For benchmark %s: class_npb = %c\n", + argv[1], class_npb); +#endif + check_info(type, class_npb); + } + + /* Get old information. */ + read_info(type, &class_old); + if (class_npb != 'U') { + if (class_old != 'X') { +#ifdef VERBOSE + printf("setparams: old settings: class_npb = %c\n", + class_old); +#endif + } + } else { + printf("setparams:\n\ + *********************************************************************\n\ + * You must specify CLASS to build this benchmark *\n\ + * For example, to build a class A benchmark, type *\n\ + * make {benchmark-name} CLASS=A *\n\ + *********************************************************************\n\n"); + + if (class_old != 'X') { +#ifdef VERBOSE + printf("setparams: Previous settings were CLASS=%c \n", class_old); +#endif + } + exit(1); /* exit on class==U */ + } + + /* Write out new information if it's different. */ + if (class_npb != class_old) { +#ifdef VERBOSE + printf("setparams: Writing %s\n", FILENAME); +#endif + write_info(type, class_npb); + } else { +#ifdef VERBOSE + printf("setparams: Settings unchanged. %s unmodified\n", FILENAME); +#endif + } + + exit(0); +} + + +/* + * get_info(): Get parameters from command line + */ + +void get_info(char *argv[], int *typep, char *classp) +{ + + *classp = *argv[2]; + + if (!strcmp(argv[1], "sp") || !strcmp(argv[1], "SP")) *typep = SP; + else if (!strcmp(argv[1], "bt") || !strcmp(argv[1], "BT")) *typep = BT; + else if (!strcmp(argv[1], "ft") || !strcmp(argv[1], "FT")) *typep = FT; + else if (!strcmp(argv[1], "lu") || !strcmp(argv[1], "LU")) *typep = LU; + else if (!strcmp(argv[1], "mg") || !strcmp(argv[1], "MG")) *typep = MG; + else if (!strcmp(argv[1], "is") || !strcmp(argv[1], "IS")) *typep = IS; + else if (!strcmp(argv[1], "ep") || !strcmp(argv[1], "EP")) *typep = EP; + else if (!strcmp(argv[1], "cg") || !strcmp(argv[1], "CG")) *typep = CG; + else if (!strcmp(argv[1], "dc") || !strcmp(argv[1], "DC")) *typep = DC; + else { + printf("setparams: Error: unknown benchmark type %s\n", argv[1]); + exit(1); + } +} + +/* + * check_info(): Make sure command line data is ok for this benchmark + */ + +void check_info(int type, char class_npb) +{ + int tmplog; + + /* check class_npb */ + if (class_npb != 'S' && + class_npb != 'A' && + class_npb != 'B' && + class_npb != 'R' && + class_npb != 'W' && + class_npb != 'C') { + printf("setparams: Unknown benchmark class_npb %c\n", class_npb); + printf("setparams: Allowed classes are \"S\", \"A\", \"B\" and \"C\"\n"); + exit(1); + } + +} + + +/* + * read_info(): Read previous information from file. + * Not an error if file doesn't exist, because this + * may be the first time we're running. + * Assumes the first line of the file is in a special + * format that we understand (since we wrote it). + */ + +void read_info(int type, char *classp) +{ + int nread, gotem = 0; + char line[200]; + FILE *fp; + fp = fopen(FILENAME, "r"); + if (fp == NULL) { +#ifdef VERBOSE + printf("setparams: INFO: configuration file %s does not exist (yet)\n", FILENAME); +#endif + goto abort; + } + + /* first line of file contains info (fortran), first two lines (C) */ + + switch(type) { + case SP: + case BT: + case FT: + case MG: + case LU: + case EP: + case CG: + nread = fscanf(fp, DESC_LINE, classp); + if (nread != 1) { + printf("setparams: Error parsing config file %s. Ignoring previous settings\n", FILENAME); + goto abort; + } + break; + case IS: + case DC: + nread = fscanf(fp, DEF_CLASS_LINE, classp); + if (nread != 1) { + printf("setparams: Error parsing config file %s. Ignoring previous settings\n", FILENAME); + goto abort; + } + break; + default: + /* never should have gotten this far with a bad name */ + printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type); + exit(1); + } + + normal_return: + *classp = *classp; + fclose(fp); + + + return; + + abort: + *classp = 'X'; + return; +} + + +/* + * write_info(): Write new information to config file. + * First line is in a special format so we can read + * it in again. Then comes a warning. The rest is all + * specific to a particular benchmark. + */ + +void write_info(int type, char class_npb) +{ + FILE *fp; + fp = fopen(FILENAME, "w"); + if (fp == NULL) { + printf("setparams: Can't open file %s for writing\n", FILENAME); + exit(1); + } + + switch(type) { + case SP: + case BT: + case FT: + case MG: + case LU: + case EP: + case CG: + /* Write out the header */ + fprintf(fp, DESC_LINE, class_npb); + /* Print out a warning so bozos don't mess with the file */ + fprintf(fp, "\ +/*\n\ +c This file is generated automatically by the setparams utility.\n\ +c It sets the number of processors and the class_npb of the NPB\n\ +c in this directory. Do not modify it by hand.\n\ +*/\n"); + + break; + case IS: + fprintf(fp, DEF_CLASS_LINE, class_npb); + fprintf(fp, "\ +/*\n\ + This file is generated automatically by the setparams utility.\n\ + It sets the number of processors and the class of the NPB\n\ + in this directory. Do not modify it by hand. */\n\ + \n"); + break; + case DC: + fprintf(fp, DEF_CLASS_LINE, class_npb); + fprintf(fp, "\ +/*\n\ + This file is generated automatically by the setparams utility.\n\ + It sets the number of processors and the class of the NPB\n\ + in this directory. Do not modify it by hand.\n\ + This file provided for backward compatibility.\n\ + It is not used in DC benchmark. */\n\ + \n"); + break; + default: + printf("setparams: (Internal error): Unknown benchmark type %d\n", + type); + exit(1); + } + + /* Now do benchmark-specific stuff */ + switch(type) { + case SP: + write_sp_info(fp, class_npb); + break; + case BT: + write_bt_info(fp, class_npb); + break; + case DC: + write_dc_info(fp, class_npb); + break; + case LU: + write_lu_info(fp, class_npb); + break; + case MG: + write_mg_info(fp, class_npb); + break; + case IS: + write_is_info(fp, class_npb); + break; + case FT: + write_ft_info(fp, class_npb); + break; + case EP: + write_ep_info(fp, class_npb); + break; + case CG: + write_cg_info(fp, class_npb); + break; + default: + printf("setparams: (Internal error): Unknown benchmark type %d\n", type); + exit(1); + } + write_convertdouble_info(type, fp); + write_compiler_info(type, fp); + fclose(fp); + return; +} + + +/* + * write_sp_info(): Write SP specific info to config file + */ + +void write_sp_info(FILE *fp, char class_npb) +{ + int problem_size, niter; + const char *dt; + if (class_npb == 'S') { problem_size = 12; dt = "0.015"; niter = 100; } + else if (class_npb == 'W') { problem_size = 36; dt = "0.0015"; niter = 400; } + else if (class_npb == 'A') { problem_size = 64; dt = "0.0015"; niter = 400; } + else if (class_npb == 'B') { problem_size = 102; dt = "0.001"; niter = 400; } + else if (class_npb == 'C') { problem_size = 162; dt = "0.00067"; niter = 400; } + else { + printf("setparams: Internal error: invalid class_npb %c\n", class_npb); + exit(1); + } + fprintf(fp, "#define\tPROBLEM_SIZE\t%d\n", problem_size); + fprintf(fp, "#define\tNITER_DEFAULT\t%d\n", niter); + fprintf(fp, "#define\tDT_DEFAULT\t%s\n", dt); +} + +/* + * write_bt_info(): Write BT specific info to config file + */ + +void write_bt_info(FILE *fp, char class_npb) +{ + int problem_size, niter; + const char *dt; + if (class_npb == 'S') { problem_size = 12; dt = "0.010"; niter = 60; } + else if (class_npb == 'W') { problem_size = 24; dt = "0.0008"; niter = 200; } + else if (class_npb == 'A') { problem_size = 64; dt = "0.0008"; niter = 200; } + else if (class_npb == 'B') { problem_size = 102; dt = "0.0003"; niter = 200; } + else if (class_npb == 'C') { problem_size = 162; dt = "0.0001"; niter = 200; } + else { + printf("setparams: Internal error: invalid class_npb %c\n", class_npb); + exit(1); + } + fprintf(fp, "#define\tPROBLEM_SIZE\t%d\n", problem_size); + fprintf(fp, "#define\tNITER_DEFAULT\t%d\n", niter); + fprintf(fp, "#define\tDT_DEFAULT\t%s\n", dt); +} + +/* + * write_dc_info(): Write DC specific info to config file + */ + + +void write_dc_info(FILE *fp, char class_npb) +{ + long int input_tuples, attrnum; + if (class_npb == 'S') { input_tuples = 1000; attrnum = 5; } + else if (class_npb == 'W') { input_tuples = 100000; attrnum = 10; } + else if (class_npb == 'A') { input_tuples = 1000000; attrnum = 15; } + else if (class_npb == 'B') { input_tuples = 10000000; attrnum = 20; } + else { + printf("setparams: Internal error: invalid class_npb %c\n", class_npb); + exit(1); + } + fprintf(fp, "long long int input_tuples=%ld, attrnum=%ld;\n", + input_tuples, attrnum); +} + +/* + * write_lu_info(): Write SP specific info to config file + */ + +void write_lu_info(FILE *fp, char class_npb) +{ + int isiz1, isiz2, itmax, inorm, problem_size; + int xdiv, ydiv; /* number of cells in x and y direction */ + const char *dt_default; + + if (class_npb == 'S') { problem_size = 12; dt_default = "0.5"; itmax = 50; } + else if (class_npb == 'W') { problem_size = 33; dt_default = "1.5e-3"; itmax = 300; } + else if (class_npb == 'A') { problem_size = 64; dt_default = "2.0"; itmax = 250; } + else if (class_npb == 'B') { problem_size = 102; dt_default = "2.0"; itmax = 250; } + else if (class_npb == 'C') { problem_size = 162; dt_default = "2.0"; itmax = 250; } + else { + printf("setparams: Internal error: invalid class_npb %c\n", class_npb); + exit(1); + } + inorm = itmax; + isiz1 = problem_size; + isiz2 = problem_size; + + fprintf(fp, "\n/* full problem size */\n"); + fprintf(fp, "#define\tISIZ1\t%d\n", problem_size); + fprintf(fp, "#define\tISIZ2\t%d\n", problem_size); + fprintf(fp, "#define\tISIZ3\t%d\n", problem_size); + + fprintf(fp, "/* number of iterations and how often to print the norm */\n"); + fprintf(fp, "#define\tITMAX_DEFAULT\t%d\n", itmax); + fprintf(fp, "#define\tINORM_DEFAULT\t%d\n", inorm); + + fprintf(fp, "#define\tDT_DEFAULT\t%s\n", dt_default); +} + +/* + * write_mg_info(): Write MG specific info to config file + */ + +void write_mg_info(FILE *fp, char class_npb) +{ + int problem_size, nit, log2_size, lt_default, lm; + int ndim1, ndim2, ndim3; + if (class_npb == 'S') { problem_size = 32; nit = 4; } + else if (class_npb == 'W') { problem_size = 64; nit = 40; } + else if (class_npb == 'A') { problem_size = 256; nit = 4; } + else if (class_npb == 'B') { problem_size = 256; nit = 20; } + else if (class_npb == 'C') { problem_size = 512; nit = 20; } + else { + printf("setparams: Internal error: invalid class_npb type %c\n", class_npb); + exit(1); + } + log2_size = ilog2(problem_size); + /* lt is log of largest total dimension */ + lt_default = log2_size; + /* log of log of maximum dimension on a node */ + lm = log2_size; + ndim1 = lm; + ndim3 = log2_size; + ndim2 = log2_size; + + fprintf(fp, "#define\tNX_DEFAULT\t%d\n", problem_size); + fprintf(fp, "#define\tNY_DEFAULT\t%d\n", problem_size); + fprintf(fp, "#define\tNZ_DEFAULT\t%d\n", problem_size); + fprintf(fp, "#define\tNIT_DEFAULT\t%d\n", nit); + fprintf(fp, "#define\tLM\t%d\n", lm); + fprintf(fp, "#define\tLT_DEFAULT\t%d\n", lt_default); + fprintf(fp, "#define\tDEBUG_DEFAULT\t%d\n", 0); + fprintf(fp, "#define\tNDIM1\t%d\n", ndim1); + fprintf(fp, "#define\tNDIM2\t%d\n", ndim2); + fprintf(fp, "#define\tNDIM3\t%d\n", ndim3); +} + + +/* + * write_is_info(): Write IS specific info to config file + */ + +void write_is_info(FILE *fp, char class_npb) +{ + int m1, m2, m3 ; + if( class_npb != 'S' && + class_npb != 'W' && + class_npb != 'A' && + class_npb != 'B' && + class_npb != 'C') + { + printf("setparams: Internal error: invalid class_npb type %c\n", class_npb); + exit(1); + } +} + + +/* + * write_cg_info(): Write CG specific info to config file + */ + +void write_cg_info(FILE *fp, char class_npb) +{ + int na,nonzer,niter; + const char *shift,*rcond="1.0e-1"; + const char *shiftS="10.0", + *shiftW="12.0", + *shiftA="20.0", + *shiftB="60.0", + *shiftC="110.0"; + + + if( class_npb == 'S' ) + { na=1400; nonzer=7; niter=15; shift=shiftS; } + else if( class_npb == 'W' ) + { na=7000; nonzer=8; niter=15; shift=shiftW; } + else if( class_npb == 'A' ) + { na=14000; nonzer=11; niter=15; shift=shiftA; } + else if( class_npb == 'B' ) + { na=75000; nonzer=13; niter=75; shift=shiftB; } + else if( class_npb == 'C' ) + { na=150000; nonzer=15; niter=75; shift=shiftC; } + else + { + printf("setparams: Internal error: invalid class_npb type %c\n", class_npb); + exit(1); + } + fprintf( fp, "#define\tNA\t%d\n", na); + fprintf( fp, "#define\tNONZER\t%d\n", nonzer); + fprintf( fp, "#define\tNITER\t%d\n", niter); + fprintf( fp, "#define\tSHIFT\t%s\n", shift); + fprintf( fp, "#define\tRCOND\t%s\n", rcond); +} + + +/* + * write_ft_info(): Write FT specific info to config file + */ + +void write_ft_info(FILE *fp, char class_npb) +{ + /* easiest way (given the way the benchmark is written) + * is to specify log of number of grid points in each + * direction m1, m2, m3. nt is the number of iterations + */ + int nx, ny, nz, maxdim, niter, np_min; + if (class_npb == 'S') { nx = 64; ny = 64; nz = 64; niter = 6;} + else if (class_npb == 'W') { nx = 128; ny = 128; nz = 32; niter = 6;} + else if (class_npb == 'A') { nx = 256; ny = 256; nz = 128; niter = 6;} + else if (class_npb == 'B') { nx = 512; ny = 256; nz = 256; niter =20;} + else if (class_npb == 'C') { nx = 512; ny = 512; nz = 512; niter =20;} + else { + printf("setparams: Internal error: invalid class_npb type %c\n", class_npb); + exit(1); + } + maxdim = nx; + if (ny > maxdim) maxdim = ny; + if (nz > maxdim) maxdim = nz; + fprintf(fp, "#define\tNX\t%d\n", nx); + fprintf(fp, "#define\tNY\t%d\n", ny); + fprintf(fp, "#define\tNZ\t%d\n", nz); + fprintf(fp, "#define\tMAXDIM\t%d\n", maxdim); + fprintf(fp, "#define\tNITER_DEFAULT\t%d\n", niter); + fprintf(fp, "#define\tNTOTAL\t%d\n", nx*ny*nz); +} + +/* + * write_ep_info(): Write EP specific info to config file + */ + +void write_ep_info(FILE *fp, char class_npb) +{ + /* easiest way (given the way the benchmark is written) + * is to specify log of number of grid points in each + * direction m1, m2, m3. nt is the number of iterations + */ + int m; + if (class_npb == 'S') { m = 24; } + else if (class_npb == 'W') { m = 25; } + else if (class_npb == 'A') { m = 28; } + else if (class_npb == 'B') { m = 30; } + else if (class_npb == 'C') { m = 32; } + else { + printf("setparams: Internal error: invalid class_npb type %c\n", class_npb); + exit(1); + } + + fprintf(fp, "#define\tCLASS\t \'%c\'\n", class_npb); + fprintf(fp, "#define\tM\t%d\n", m); +} + + +/* + * This is a gross hack to allow the benchmarks to + * print out how they were compiled. Various other ways + * of doing this have been tried and they all fail on + * some machine - due to a broken "make" program, or + * F77 limitations, of whatever. Hopefully this will + * always work because it uses very portable C. Unfortunately + * it relies on parsing the make.def file - YUK. + * If your machine doesn't have or , happy hacking! + * + */ + +#define VERBOSE +#define LL 400 +#include +#define DEFFILE "../config/make.def" +#define DEFAULT_MESSAGE "(none)" +void write_compiler_info(int type, FILE *fp) +{ + FILE *deffile; + char line[LL]; + char f77[LL], flink[LL], f_lib[LL], f_inc[LL], fflags[LL], flinkflags[LL]; + char compiletime[LL], randfile[LL]; + char cc[LL], cflags[LL], clink[LL], clinkflags[LL], + c_lib[LL], c_inc[LL]; + struct tm *tmp; + time_t t; + deffile = fopen(DEFFILE, "r"); + if (deffile == NULL) { + printf("\n\ +setparams: File %s doesn't exist. To build the NAS benchmarks\n\ + you need to create is according to the instructions\n\ + in the README in the main directory and comments in \n\ + the file config/make.def.template\n", DEFFILE); + exit(1); + } + strcpy(f77, DEFAULT_MESSAGE); + strcpy(flink, DEFAULT_MESSAGE); + strcpy(f_lib, DEFAULT_MESSAGE); + strcpy(f_inc, DEFAULT_MESSAGE); + strcpy(fflags, DEFAULT_MESSAGE); + strcpy(flinkflags, DEFAULT_MESSAGE); + strcpy(randfile, DEFAULT_MESSAGE); + strcpy(cc, DEFAULT_MESSAGE); + strcpy(cflags, DEFAULT_MESSAGE); + strcpy(clink, DEFAULT_MESSAGE); + strcpy(clinkflags, DEFAULT_MESSAGE); + strcpy(c_lib, DEFAULT_MESSAGE); + strcpy(c_inc, DEFAULT_MESSAGE); + + while (fgets(line, LL, deffile) != NULL) { + if (*line == '#') continue; + /* yes, this is inefficient. but it's simple! */ + check_line(line, (char*)"F77", f77); + check_line(line, (char*)"FLINK", flink); + check_line(line, (char*)"F_LIB", f_lib); + check_line(line, (char*)"F_INC", f_inc); + check_line(line, (char*)"FFLAGS", fflags); + check_line(line, (char*)"FLINKFLAGS", flinkflags); + check_line(line, (char*)"RAND", randfile); + check_line(line, (char*)"CC", cc); + check_line(line, (char*)"CFLAGS", cflags); + check_line(line, (char*)"CLINK", clink); + check_line(line, (char*)"CLINKFLAGS", clinkflags); + check_line(line, (char*)"C_LIB",c_lib); + check_line(line, (char*)"C_INC", c_inc); + } + + + (void) time(&t); + tmp = localtime(&t); + (void) strftime(compiletime, (size_t)LL, "%d %b %Y", tmp); + + + switch(type) { + case FT: + case SP: + case BT: + case MG: + case LU: + case EP: + case CG: + put_def_string(fp, (char*)"COMPILETIME", compiletime); + put_def_string(fp, (char*)"NPBVERSION", (char*)VERSION); + put_def_string(fp, (char*)"CS1", cc); + put_def_string(fp, (char*)"CS2", clink); + put_def_string(fp, (char*)"CS3", c_lib); + put_def_string(fp, (char*)"CS4", c_inc); + put_def_string(fp, (char*)"CS5", cflags); + put_def_string(fp, (char*)"CS6", clinkflags); + put_def_string(fp, (char*)"CS7", randfile); + break; + case IS: + case DC: + put_def_string(fp, (char*)"COMPILETIME", compiletime); + put_def_string(fp, (char*)"NPBVERSION", (char*)VERSION); + put_def_string(fp, (char*)"CC", cc); + put_def_string(fp, (char*)"CFLAGS", cflags); + put_def_string(fp, (char*)"CLINK", clink); + put_def_string(fp, (char*)"CLINKFLAGS", clinkflags); + put_def_string(fp, (char*)"C_LIB", c_lib); + put_def_string(fp, (char*)"C_INC", c_inc); + break; + default: + printf("setparams: (Internal error): Unknown benchmark type %d\n", + type); + exit(1); + } + +} + +void check_line(char *line, char *label, char *val) +{ + char *original_line; + original_line = line; + /* compare beginning of line and label */ + while (*label != '\0' && *line == *label) { + line++; label++; + } + /* if *label is not EOS, we must have had a mismatch */ + if (*label != '\0') return; + /* if *line is not a space, actual label is longer than test label */ + if (!isspace(*line) && *line != '=') return ; + /* skip over white space */ + while (isspace(*line)) line++; + /* next char should be '=' */ + if (*line != '=') return; + /* skip over white space */ + while (isspace(*++line)); + /* if EOS, nothing was specified */ + if (*line == '\0') return; + /* finally we've come to the value */ + strcpy(val, line); + /* chop off the newline at the end */ + val[strlen(val)-1] = '\0'; + if (val[strlen(val) - 1] == '\\') { + printf("\n\ +setparams: Error in file make.def. Because of the way in which\n\ + command line arguments are incorporated into the\n\ + executable benchmark, you can't have any continued\n\ + lines in the file make.def, that is, lines ending\n\ + with the character \"\\\". Although it may be ugly, \n\ + you should be able to reformat without continuation\n\ + lines. The offending line is\n\ + %s\n", original_line); + exit(1); + } +} + +int check_include_line(char *line, char *filename) +{ + char *include_string = (char*)"include"; + /* compare beginning of line and "include" */ + while (*include_string != '\0' && *line == *include_string) { + line++; include_string++; + } + /* if *include_string is not EOS, we must have had a mismatch */ + if (*include_string != '\0') return(0); + /* if *line is not a space, first word is not "include" */ + if (!isspace(*line)) return(0); + /* skip over white space */ + while (isspace(*++line)); + /* if EOS, nothing was specified */ + if (*line == '\0') return(0); + /* next keyword should be name of include file in *filename */ + while (*filename != '\0' && *line == *filename) { + line++; filename++; + } + if (*filename != '\0' || + (*line != ' ' && *line != '\0' && *line !='\n')) return(0); + else return(1); +} + + +#define MAXL 146 +void put_string(FILE *fp, char *name, char *val) +{ + int len; + len = strlen(val); + if (len > MAXL) { + val[MAXL] = '\0'; + val[MAXL-1] = '.'; + val[MAXL-2] = '.'; + val[MAXL-3] = '.'; + len = MAXL; + } + fprintf(fp, "%scharacter*%d %s\n", FINDENT, len, name); + fprintf(fp, "%sparameter (%s=\'%s\')\n", FINDENT, name, val); +} + +/* NOTE: is the ... stuff necessary in C? */ +void put_def_string(FILE *fp, char *name, char *val) +{ + int len; + len = strlen(val); + if (len > MAXL) { + val[MAXL] = '\0'; + val[MAXL-1] = '.'; + val[MAXL-2] = '.'; + val[MAXL-3] = '.'; + len = MAXL; + } + fprintf(fp, "#define %s \"%s\"\n", name, val); +} + +void put_def_variable(FILE *fp, char *name, char *val) +{ + int len; + len = strlen(val); + if (len > MAXL) { + val[MAXL] = '\0'; + val[MAXL-1] = '.'; + val[MAXL-2] = '.'; + val[MAXL-3] = '.'; + len = MAXL; + } + fprintf(fp, "#define %s %s\n", name, val); +} + + + +#if 0 + +/* this version allows arbitrarily long lines but + * some compilers don't like that and they're rarely + * useful + */ + +#define LINELEN 65 +void put_string(FILE *fp, char *name, char *val) +{ + int len, nlines, pos, i; + char line[100]; + len = strlen(val); + nlines = len/LINELEN; + if (nlines*LINELEN < len) nlines++; + fprintf(fp, "%scharacter*%d %s\n", FINDENT, nlines*LINELEN, name); + fprintf(fp, "%sparameter (%s = \n", FINDENT, name); + for (i = 0; i < nlines; i++) { + pos = i*LINELEN; + if (i == 0) fprintf(fp, "%s\'", CONTINUE); + else fprintf(fp, "%s", CONTINUE); + /* number should be same as LINELEN */ + fprintf(fp, "%.65s", val+pos); + if (i == nlines-1) fprintf(fp, "\')\n"); + else fprintf(fp, "\n"); + } +} + +#endif + + +/* integer log base two. Return error is argument isn't + * a power of two or is less than or equal to zero + */ + +int ilog2(int i) +{ + int log2; + int exp2 = 1; + if (i <= 0) return(-1); + + for (log2 = 0; log2 < 20; log2++) { + if (exp2 == i) return(log2); + exp2 *= 2; + } + return(-1); +} + + + +void write_convertdouble_info(int type, FILE *fp) +{ + switch(type) { + case SP: + case BT: + case LU: + case FT: + case MG: + case EP: + case CG: +#ifdef CONVERTDOUBLE + fprintf(fp, "#define\tCONVERTDOUBLE\tTRUE\n"); +#else + fprintf(fp, "#define\tCONVERTDOUBLE\tFALSE\n"); +#endif + break; + } +}