diff --git a/NPB/CG/Makefile b/NPB/CG/Makefile
new file mode 100644
index 0000000..688d3b8
--- /dev/null
+++ b/NPB/CG/Makefile
@@ -0,0 +1,20 @@
+SHELL=/bin/sh
+BENCHMARK=cg
+BENCHMARKU=CG
+
+include ../config/make.def
+
+OBJS = cg.o ${COMMON}/c_print_results.o  \
+       ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+	${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS}
+
+cg.o:		cg.cpp  npbparams.hpp
+	${CCOMPILE} cg.cpp
+
+clean:
+	- rm -f *.o *~
+	- rm -f npbparams.hpp core
diff --git a/NPB/CG/cg.cpp b/NPB/CG/cg.cpp
new file mode 100644
index 0000000..ddeb6f3
--- /dev/null
+++ b/NPB/CG/cg.cpp
@@ -0,0 +1,1055 @@
+/*--------------------------------------------------------------------
+
+  Information on NAS Parallel Benchmarks is available at:
+
+  http://www.nas.nasa.gov/Software/NPB/
+
+  Authors: M. Yarrow
+  C. Kuszmaul
+
+  STL version:
+  Nicco Mietzsch <nicco.mietzsch@campus.lmu.de>
+
+  CPP and OpenMP version:
+      Dalvan Griebler <dalvangriebler@gmail.com>
+      Júnior Löff <loffjh@gmail.com>
+
+--------------------------------------------------------------------*/
+
+/*
+c---------------------------------------------------------------------
+c  Note: please observe that in the routine conj_grad three
+c  implementations of the sparse matrix-vector multiply have
+c  been supplied.  The default matrix-vector multiply is not
+c  loop unrolled.  The alternate implementations are unrolled
+c  to a depth of 2 and unrolled to a depth of 8.  Please
+c  experiment with these to find the fastest for your particular
+c  architecture.  If reporting timing results, any of these three may
+c  be used without penalty.
+c---------------------------------------------------------------------
+*/
+#include <libdash.h>
+
+#include <iostream>
+#include "npbparams.hpp"
+#include "npb-CPP.hpp"
+
+#define  NZ  NA*(NONZER+1)*(NONZER+1)+NA*(NONZER+2)
+
+#define TIMER_ENABLED FALSE
+
+/* global variables */
+
+/* common /partit_size/ */
+static int naa;
+static int nzz;
+static int firstrow;
+static int lastrow;
+static int firstcol;
+static int lastcol;
+
+/* common /urando/ */
+static double amult;
+static double tran;
+
+/* function declarations */
+static void conj_grad (dash::Array<int, int, dash::CSRPattern<1>> &colidx, dash::Array<int> &rowstr, dash::Array<double> &x,
+  dash::Array<double> &z, dash::Array<double, int, dash::CSRPattern<1>> &al, dash::Array<double> &p, dash::Array<double> &q,
+  dash::Array<double> &r, dash::Array<double> &w, double *rnorm);
+static void makea(int n, int nz, double a[], int colidx[], int rowstr[],
+  int nonzer, int firstrow, int lastrow, int firstcol,
+  int lastcol, double rcond, int arow[], int acol[],
+  double aelt[], double v[], int iv[], double shift );
+static void sparse(double a[], int colidx[], int rowstr[], int n,
+  int arow[], int acol[], double aelt[],
+  int firstrow, int lastrow,
+  double x[], boolean mark[], int nzloc[], int nnza);
+static void sprnvc(int n, int nz, double v[], int iv[], int nzloc[], int mark[]);
+static int icnvrt(double x, int ipwr2);
+static void vecset(int n, double v[], int iv[], int *nzv, int i, double val);
+
+/*--------------------------------------------------------------------
+      program cg
+--------------------------------------------------------------------*/
+
+int main(int argc, char **argv)
+{
+
+  dash::init(&argc, &argv);
+
+  dash::Array<double> a_global(NZ+1);
+  dash::Array<int> rowstr_global(NA+1+1);
+  dash::Array<int> colidx_global(NZ+1);
+
+  dash::Array<double> x(NA+1);  /* x[1:NA] */
+  dash::Array<double> z(NA+1);  /* z[1:NA] */
+  dash::Array<double> p(NA+1);  /* p[1:NA] */
+  dash::Array<double> q(NA+1);  /* q[1:NA] */
+  dash::Array<double> r(NA+1);  /* r[1:NA] */
+  dash::Array<double> w(NA+1);  /* w[1:NA] */
+
+
+  int i, j, k, it;
+  int nthreads = dash::size();
+  double zeta;
+  double rnorm;
+  double norm_temp11;
+  double norm_temp12;
+  double t, mflops;
+  char class_npb;
+  boolean verified;
+  double zeta_verify_value, epsilon;
+
+  firstrow = 1;
+  lastrow  = NA;
+  firstcol = 1;
+  lastcol  = NA;
+
+  if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) {
+    class_npb = 'S';
+    zeta_verify_value = 8.5971775078648;
+  } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) {
+    class_npb = 'W';
+    zeta_verify_value = 10.362595087124;
+  } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) {
+    class_npb = 'A';
+    zeta_verify_value = 17.130235054029;
+  } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) {
+    class_npb = 'B';
+    zeta_verify_value = 22.712745482631;
+  } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) {
+    class_npb = 'C';
+    zeta_verify_value = 28.973605592845;
+  } else {
+    class_npb = 'U';
+  }
+
+  if ( 0 == dash::myid() ) {
+
+    printf("\n\n NAS Parallel Benchmarks 4.0 C++ DASH version"" - CG Benchmark\n");
+    printf("\n\n Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br>\n");
+    printf("\n\n DASH version by: Nicco Mietzsch <nicco.mietzsch@campus.lmu.de>\n");
+    printf(" Size: %10d\n", NA);
+    printf(" Iterations: %5d\n", NITER);
+
+
+    naa = NA;
+    nzz = NZ;
+
+    /*--------------------------------------------------------------------
+    c  Initialize random number generator
+    c-------------------------------------------------------------------*/
+    tran    = 314159265.0;
+    amult   = 1220703125.0;
+    zeta    = randlc( &tran, amult );
+
+    /*--------------------------------------------------------------------
+    c
+    c-------------------------------------------------------------------*/
+    static int colidx_init[NZ+1];  /* colidx[1:NZ] */
+    static int rowstr_init[NA+1+1];  /* rowstr[1:NA+1] */
+
+    static double a_init[NZ+1];    /* a[1:NZ] */
+
+
+
+    static int iv[2*NA+1+1];  /* iv[1:2*NA+1] */
+    static int arow[NZ+1];    /* arow[1:NZ] */
+    static int acol[NZ+1];    /* acol[1:NZ] */
+
+    static double v[NA+1+1];  /* v[1:NA+1] */
+    static double aelt[NZ+1];  /* aelt[1:NZ] */
+
+
+    makea(naa, nzz, a_init, colidx_init, rowstr_init, NONZER,
+    firstrow, lastrow, firstcol, lastcol,
+    RCOND, arow, acol, aelt, v, iv, SHIFT);
+
+    for (i = 0; i < NZ+1; ++i) colidx_global[i] = colidx_init[i];
+    for (i = 0; i < NA+1+1; ++i) rowstr_global[i] = rowstr_init[i];
+    for (i = 0; i < NZ+1; ++i) a_global[i] = a_init[i];
+
+
+    /*--------------------------------------------------------------------
+    c  set starting vector to (1, 1, .... 1)
+    c-------------------------------------------------------------------*/
+
+    for (i = 1; i <= NA; i++) {
+      x[i] = 1.0;
+    }
+
+  }
+  dash::barrier();
+
+  //setup local arrays
+  // first, setup rowstr
+
+  int elem_per_unit = ceil(((double) NA+1)/dash::size());
+
+  dash::Array<int> rowstr((elem_per_unit+1)*dash::size());
+
+  for (int i = 0; i < elem_per_unit+1; i++) {
+    rowstr.local[i] = rowstr_global[min(dash::myid()*elem_per_unit + i, NA+1)];
+  }
+
+
+  //now setup a and colidx
+
+  using pattern_t = dash::CSRPattern<1>;
+  using index_t = int;
+  using extent_t = typename pattern_t::size_type;
+
+  using array_double = dash::Array<double, index_t, pattern_t>;
+  using array_int = dash::Array<int, index_t, pattern_t>;
+
+  std::vector<extent_t> local_sizes;
+  dash::Array<int> my_elem_count(dash::size());
+
+  my_elem_count.local[0] = rowstr.local[elem_per_unit] - rowstr.local[0];
+
+  for (int unit_idx = 0; unit_idx < dash::size(); ++unit_idx) {
+    local_sizes.push_back(my_elem_count[unit_idx]);
+  }
+
+  pattern_t pattern(local_sizes);
+
+  array_double a(pattern);
+  array_int colidx(pattern);
+
+  int offset = 0;
+  for (int k = 0; k < dash::myid(); ++k) offset += my_elem_count[k];
+
+  for (int i = 0; i < my_elem_count.local[0]; ++i) {
+    a.local[i] = a_global[offset + i];
+    colidx.local[i] = colidx_global[offset + i];
+  }
+
+  for (int i = 0; i < elem_per_unit+1; ++i) {
+    rowstr.local[i] -= offset;
+  }
+
+  dash::barrier();
+
+  zeta = 0.0;
+
+  /*-------------------------------------------------------------------
+  c---->
+  c  Do one iteration untimed to init all code and data page tables
+  c---->                    (then reinit, start timing, to niter its)
+  c-------------------------------------------------------------------*/
+
+  for (it = 1; it <= 1; it++) {
+
+    /*--------------------------------------------------------------------
+    c  The call to the conjugate gradient routine:
+    c-------------------------------------------------------------------*/
+    conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm);
+
+    /*--------------------------------------------------------------------
+    c  zeta = shift + 1/(x.z)
+    c  So, first: (x.z)
+    c  Also, find norm of z
+    c  So, first: (z.z)
+    c-------------------------------------------------------------------*/
+    dash::Array<double> norm_temp11(dash::size());
+
+    auto it2 = z.lbegin();
+    for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) {
+      norm_temp11.local[0] += *it1 * *it2;
+      it2++;
+    }
+
+    norm_temp11.local[0] = dash::reduce(norm_temp11.begin(), norm_temp11.end(), 0.0, std::plus<double>());
+
+    dash::Array<double> norm_temp12(dash::size());
+
+    for (auto it1 = z.lbegin(); it1 != z.lend(); ++it1) {
+      norm_temp12.local[0] += *it1 * *it1;
+    }
+
+    norm_temp12.local[0] = dash::reduce(norm_temp12.begin(), norm_temp12.end(), 0.0, std::plus<double>());
+
+    norm_temp12.local[0] = 1.0 / sqrt( norm_temp12.local[0] );
+
+    /*--------------------------------------------------------------------
+    c  Normalize z to obtain x
+    c-------------------------------------------------------------------*/
+
+    it2 = z.lbegin();
+    for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) {
+      *it1 = *it2 * norm_temp12.local[0];
+      it2++;
+    }
+
+  } /* end of do one iteration untimed */
+
+  /*--------------------------------------------------------------------
+  c  set starting vector to (1, 1, .... 1)
+  c-------------------------------------------------------------------*/
+  if (0 == dash::myid()) {
+
+    for (i = 1; i <= NA; i++) {
+      x[i] = 1.0;
+    }
+
+  }
+
+  zeta = 0.0;
+
+  dash::barrier();
+
+  if (0 == dash::myid()) {
+
+    timer_clear( 1 );
+    timer_clear( 2 );
+
+    timer_start( 1 );
+
+  }
+
+  /*--------------------------------------------------------------------
+  c---->
+  c  Main Iteration for inverse power method
+  c---->
+  c-------------------------------------------------------------------*/
+
+
+  for (it = 1; it <= NITER; it++) {
+
+    /*--------------------------------------------------------------------
+    c  The call to the conjugate gradient routine:
+    c-------------------------------------------------------------------*/
+    conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm);
+    /*--------------------------------------------------------------------
+    c  zeta = shift + 1/(x.z)
+    c  So, first: (x.z)
+    c  Also, find norm of z
+    c  So, first: (z.z)
+    c-------------------------------------------------------------------*/
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+    dash::Array<double> norm_temp11(dash::size());
+
+    auto it2 = z.lbegin();
+    for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) {
+      norm_temp11.local[0] += *it1 * *it2;
+      it2++;
+    }
+
+    norm_temp11.local[0] = dash::reduce(norm_temp11.begin(), norm_temp11.end(), 0.0, std::plus<double>());
+
+    dash::Array<double> norm_temp12(dash::size());
+
+    for (auto it1 = z.lbegin(); it1 != z.lend(); ++it1) {
+      norm_temp12.local[0] += *it1 * *it1;
+    }
+
+    norm_temp12.local[0] = dash::reduce(norm_temp12.begin(), norm_temp12.end(), 0.0, std::plus<double>());
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+
+    norm_temp12.local[0] = 1.0 / sqrt( norm_temp12.local[0] );
+
+    if ( 0 == dash::myid() ) {
+
+      zeta = SHIFT + 1.0 / norm_temp11.local[0];
+
+      if ( it == 1 ) {
+        printf("   iteration           ||r||                 zeta\n");
+      }
+
+      printf("    %5d       %20.14e%20.13e\n", it, rnorm, zeta);
+
+    }
+
+    /*--------------------------------------------------------------------
+    c  Normalize z to obtain x
+    c-------------------------------------------------------------------*/
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+    it2 = z.lbegin();
+    for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) {
+      *it1 = *it2 * norm_temp12.local[0];
+      it2++;
+    }
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+
+    /* end of main iter inv pow meth */
+
+  }
+
+  dash::barrier();
+
+  if (0 == dash::myid()) {
+
+
+    timer_stop( 1 );
+    /*--------------------------------------------------------------------
+    c  End of timed section
+    c-------------------------------------------------------------------*/
+
+    t = timer_read( 1 );
+
+    printf(" Benchmark completed\n");
+
+    epsilon = 1.0e-10;
+    if (class_npb != 'U') {
+      if (fabs(zeta - zeta_verify_value) <= epsilon) {
+        verified = TRUE;
+        printf(" VERIFICATION SUCCESSFUL\n");
+        printf(" Zeta is    %20.12e\n", zeta);
+        printf(" Error is   %20.12e\n", zeta - zeta_verify_value);
+      } else {
+        verified = FALSE;
+        printf(" VERIFICATION FAILED\n");
+        printf(" Zeta                %20.12e\n", zeta);
+        printf(" The correct zeta is %20.12e\n", zeta_verify_value);
+      }
+    } else {
+      verified = FALSE;
+      printf(" Problem size unknown\n");
+      printf(" NO VERIFICATION PERFORMED\n");
+    }
+    if ( t != 0.0 ) {
+      mflops = (2.0*NITER*NA)
+      * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 )
+      / t / 1000000.0;
+    } else {
+      mflops = 0.0;
+    }
+    c_print_results((char*)"CG", class_npb, NA, 0, 0, NITER, nthreads, t, mflops, (char*)"          floating point", verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);
+
+  }
+
+  return 0;
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+static void conj_grad (
+  dash::Array<int, int, dash::CSRPattern<1>> &colidx,  /* colidx[1:nzz] */
+  dash::Array<int> &rowstr,  /* rowstr[1:naa+1] */
+  dash::Array<double> &x,  /* x[*] */
+  dash::Array<double> &z,  /* z[*] */
+  dash::Array<double, int, dash::CSRPattern<1>> &a,  /* a[1:nzz] */
+  dash::Array<double> &p,  /* p[*] */
+  dash::Array<double> &q,  /* q[*] */
+  dash::Array<double> &r,  /* r[*] */
+  dash::Array<double> &w,  /* w[*] */
+  double *rnorm )
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------
+c  Floaging point arrays here are named as in NPB1 spec discussion of
+c  CG algorithm
+c---------------------------------------------------------------------*/
+{
+  //static double d, sum, rho, rho0, alpha, beta;
+  //double d, sum, rho, rho0, alpha, beta;
+  dash::Array<double> d(dash::size());
+  dash::Array<double> sum(dash::size());
+  dash::Array<double> rho(dash::size());
+  dash::Array<double> rho0(dash::size());
+  dash::Array<double> alpha(dash::size());
+  dash::Array<double> beta(dash::size());
+  int j;
+  int cgit, cgitmax = 25;
+
+
+  rho.local[0] = 0.0;
+
+  int elem_per_unit = ceil(((double) NA+1)/dash::size());
+
+  /*--------------------------------------------------------------------
+  c  Initialize the CG algorithm:
+  c-------------------------------------------------------------------*/
+
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+  dash::fill(q.begin(), q.end(), 0.0);
+  dash::fill(z.begin(), z.end(), 0.0);
+  dash::fill(w.begin(), w.end(), 0.0);
+
+  std::copy(x.lbegin(), x.lend(), r.lbegin());
+  std::copy(r.lbegin(), r.lend(), p.lbegin());
+
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+
+  /*--------------------------------------------------------------------
+  c  rho = r.r
+  c  Now, obtain the norm of r: First, sum squares of r elements locally...
+  c-------------------------------------------------------------------*/
+
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+  for (auto it1 = r.lbegin(); it1 != r.lend(); it1++) {
+    rho.local[0] = rho.local[0] +  *it1 * *it1;
+  }
+
+  rho.local[0] = dash::reduce(rho.begin(), rho.end(), 0.0, std::plus<double>());
+
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+
+  /*--------------------------------------------------------------------
+  c---->
+  c  The conj grad iteration loop
+  c---->
+  c-------------------------------------------------------------------*/
+
+  for (cgit = 1; cgit <= cgitmax; cgit++) {
+
+    rho0.local[0] = rho.local[0];
+    d.local[0] = 0.0;
+    rho.local[0] = 0.0;
+
+
+    /*--------------------------------------------------------------------
+    c  q = A.p
+    c  The partition submatrix-vector multiply: use workspace w
+    c---------------------------------------------------------------------
+    C
+    C  NOTE: this version of the multiply is actually (slightly: maybe %5)
+    C        faster on the sp2 on 16 nodes than is the unrolled-by-2 version
+    C        below.   On the Cray t3d, the reverse is true, i.e., the
+    C        unrolled-by-two version is some 10% faster.
+    C        The unrolled-by-8 version below is significantly faster
+    C        on the Cray t3d - overall speed of code is 1.5 times faster.
+    */
+
+    double p_local[p.size()];
+    dash::copy(p.begin(), p.end(), &p_local[0]);
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+    // rolled version
+
+    for (int j = 0; j < elem_per_unit; j++) {
+      double sum = 0.0;
+      for (int k = rowstr.local[j]; k < rowstr.local[j+1]; k++) {
+        sum = sum + a.local[k]*p_local[colidx.local[k]];
+      }
+      w.local[j] = sum;
+    }
+
+
+    //unrolled-by-two version
+    /*
+    for (int j = 0; j < elem_per_unit; j++) {
+      int iresidue;
+      double sum1, sum2;
+      int i = rowstr.local[j];
+      iresidue = (rowstr.local[j+1]-i) % 2;
+      sum1 = 0.0;
+      sum2 = 0.0;
+      if (iresidue == 1) sum1 = sum1 + a.local[i]*p_local[colidx.local[i]];
+      for (int k = i+iresidue; k <= rowstr.local[j+1]-2; k += 2) {
+        sum1 = sum1 + a.local[k]   * p_local[colidx.local[k]];
+        sum2 = sum2 + a.local[k+1] * p_local[colidx.local[k+1]];
+      }
+      w.local[j] = sum1 + sum2;
+    }*/
+
+
+    //unrolled-by-8 version
+    /*
+    for (int j = 0; j < elem_per_unit; j++) {
+      int iresidue, k;
+      int i = rowstr.local[j];
+      iresidue = (rowstr.local[j+1]-i) % 8;
+      double sum = 0.0;
+      for (k = i; k <= i+iresidue-1; k++) {
+        sum = sum +  a.local[k] * p_local[colidx.local[k]];
+      }
+      for (k = i+iresidue; k <= rowstr.local[j+1]-8; k += 8) {
+        sum = sum
+        + a.local[k  ] * p_local[colidx.local[k  ]]
+        + a.local[k+1] * p_local[colidx.local[k+1]]
+        + a.local[k+2] * p_local[colidx.local[k+2]]
+        + a.local[k+3] * p_local[colidx.local[k+3]]
+        + a.local[k+4] * p_local[colidx.local[k+4]]
+        + a.local[k+5] * p_local[colidx.local[k+5]]
+        + a.local[k+6] * p_local[colidx.local[k+6]]
+        + a.local[k+7] * p_local[colidx.local[k+7]];
+      }
+      w.local[j] = sum;
+    }*/
+
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+    std::copy(w.lbegin(), w.lend(), q.lbegin());
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+    /*--------------------------------------------------------------------
+    c  Clear w for reuse... Do we really need that? Commented it out for the moment.
+    c-------------------------------------------------------------------*/
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+    //dash::fill(w.begin(), w.end(), 0.0);
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+    /*--------------------------------------------------------------------
+    c  Obtain p.q
+    c-------------------------------------------------------------------*/
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+    auto it2 = p.lbegin();
+    for (auto it1 = q.lbegin(); it1 != q.lend(); ++it1) {
+      d.local[0] += *it1 * *it2;
+      it2++;
+    }
+
+    d.local[0] = dash::reduce(d.begin(), d.end(), 0.0, std::plus<double>());
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+    /*--------------------------------------------------------------------
+    c  Obtain alpha = rho / (p.q)
+    c-------------------------------------------------------------------*/
+
+    alpha.local[0] = rho0.local[0] / d.local[0];
+
+    /*--------------------------------------------------------------------
+    c  Save a temporary of rho
+    c-------------------------------------------------------------------*/
+      /*  rho0 = rho;*/
+
+    /*---------------------------------------------------------------------
+    c  Obtain z = z + alpha*p
+    c  and    r = r - alpha*q
+    c---------------------------------------------------------------------*/
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+    it2 = p.lbegin();
+    for (auto it1 = z.lbegin(); it1 != z.lend(); ++it1) {
+      *it1 = *it1 + alpha.local[0]* *it2;
+      it2++;
+    }
+
+    it2 = q.lbegin();
+    for (auto it1 = r.lbegin(); it1 != r.lend(); ++it1) {
+      *it1 = *it1 - alpha.local[0]* *it2;
+      it2++;
+    }
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+    /*---------------------------------------------------------------------
+    c  rho = r.r
+    c  Now, obtain the norm of r: First, sum squares of r elements locally...
+    c---------------------------------------------------------------------*/
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+    for (auto it1 = r.lbegin(); it1 != r.lend(); ++it1) {
+      rho.local[0] += *it1 * *it1;
+    }
+
+    rho.local[0] = dash::reduce(rho.begin(), rho.end(), 0.0, std::plus<double>());
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+    /*--------------------------------------------------------------------
+    c  Obtain beta:
+    c-------------------------------------------------------------------*/
+
+    beta.local[0] = rho.local[0] / rho0.local[0];
+
+    /*--------------------------------------------------------------------
+    c  p = r + beta*p
+    c-------------------------------------------------------------------*/
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+    it2 = r.lbegin();
+    for (auto it1 = p.lbegin(); it1 != p.lend(); ++it1) {
+      *it1 = *it2 + beta.local[0]* *it1;
+      it2++;
+    }
+    dash::barrier();
+
+    if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+  } /* end of do cgit=1,cgitmax */
+
+  /*---------------------------------------------------------------------
+  c  Compute residual norm explicitly:  ||r|| = ||x - A.z||
+  c  First, form A.z
+  c  The partition submatrix-vector multiply
+  c---------------------------------------------------------------------*/
+
+  sum.local[0] = 0.0;
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+  double z_local[p.size()];
+  dash::copy(z.begin(), z.end(), &z_local[0]);
+
+  for (int j = 0; j < elem_per_unit; j++) {
+    double sum = 0.0;
+    for (int k = rowstr.local[j]; k < rowstr.local[j+1]; k++) {
+      sum = sum + a.local[k]*z_local[colidx.local[k]];
+    }
+    w.local[j] = sum;
+  }
+
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+  std::copy(w.lbegin(), w.lend(), r.lbegin());
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+  /*--------------------------------------------------------------------
+  c  At this point, r contains A.z
+  c-------------------------------------------------------------------*/
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_start(2);
+
+  auto it2 = r.lbegin();
+  for (auto it1 = x.lbegin(); it1 != x.lend(); ++it1) {
+    sum.local[0] += (*it1 - *it2)*(*it1 - *it2);
+    it2++;
+  }
+
+  sum.local[0] = dash::reduce(sum.begin(), sum.end(), 0.0, std::plus<double>());
+
+  if ( TIMER_ENABLED == TRUE && 0 == dash::myid() ) timer_stop(2);
+
+  (*rnorm) = sqrt(sum.local[0]);
+}
+
+/*---------------------------------------------------------------------
+c       generate the test problem for benchmark 6
+c       makea generates a sparse matrix with a
+c       prescribed sparsity distribution
+c
+c       parameter    type        usage
+c
+c       input
+c
+c       n            i           number of cols/rows of matrix
+c       nz           i           nonzeros as declared array size
+c       rcond        r*8         condition number
+c       shift        r*8         main diagonal shift
+c
+c       output
+c
+c       a            r*8         array for nonzeros
+c       colidx       i           col indices
+c       rowstr       i           row pointers
+c
+c       workspace
+c
+c       iv, arow, acol i
+c       v, aelt        r*8
+c---------------------------------------------------------------------*/
+static void makea(
+  int n,
+  int nz,
+  double a[],    /* a[1:nz] */
+  int colidx[],  /* colidx[1:nz] */
+  int rowstr[],  /* rowstr[1:n+1] */
+  int nonzer,
+  int firstrow,
+  int lastrow,
+  int firstcol,
+  int lastcol,
+  double rcond,
+  int arow[],    /* arow[1:nz] */
+  int acol[],    /* acol[1:nz] */
+  double aelt[],  /* aelt[1:nz] */
+  double v[],    /* v[1:n+1] */
+  int iv[],    /* iv[1:2*n+1] */
+  double shift )
+{
+  int i, nnza, iouter, ivelt, ivelt1, irow, nzv;
+
+  /*--------------------------------------------------------------------
+  c      nonzer is approximately  (int(sqrt(nnza /n)));
+  c-------------------------------------------------------------------*/
+
+  double size, ratio, scale;
+  int jcol;
+
+  size = 1.0;
+  ratio = pow(rcond, (1.0 / (double)n));
+  nnza = 0;
+
+  /*---------------------------------------------------------------------
+  c  Initialize colidx(n+1 .. 2n) to zero.
+  c  Used by sprnvc to mark nonzero positions
+  c---------------------------------------------------------------------*/
+
+  for (i = 1; i <= n; i++) {
+    colidx[n+i] = 0;
+  }
+  for (iouter = 1; iouter <= n; iouter++) {
+    nzv = nonzer;
+    sprnvc(n, nzv, v, iv, &(colidx[0]), &(colidx[n]));
+    vecset(n, v, iv, &nzv, iouter, 0.5);
+    for (ivelt = 1; ivelt <= nzv; ivelt++){
+      jcol = iv[ivelt];
+      if (jcol >= firstcol && jcol <= lastcol) {
+        scale = size * v[ivelt];
+        for (ivelt1 = 1; ivelt1 <= nzv; ivelt1++) {
+          irow = iv[ivelt1];
+          if (irow >= firstrow && irow <= lastrow) {
+            nnza = nnza + 1;
+            if (nnza > nz) {
+              printf("Space for matrix elements exceeded in" " makea\n");
+              printf("nnza, nzmax = %d, %d\n", nnza, nz);
+              printf("iouter = %d\n", iouter);
+              exit(1);
+            }
+            acol[nnza] = jcol;
+            arow[nnza] = irow;
+            aelt[nnza] = v[ivelt1] * scale;
+          }
+        }
+      }
+    }
+    size = size * ratio;
+  }
+
+  /*---------------------------------------------------------------------
+  c       ... add the identity * rcond to the generated matrix to bound
+  c           the smallest eigenvalue from below by rcond
+  c---------------------------------------------------------------------*/
+  for (i = firstrow; i <= lastrow; i++) {
+    if (i >= firstcol && i <= lastcol) {
+      iouter = n + i;
+      nnza = nnza + 1;
+      if (nnza > nz) {
+        printf("Space for matrix elements exceeded in makea\n");
+        printf("nnza, nzmax = %d, %d\n", nnza, nz);
+        printf("iouter = %d\n", iouter);
+        exit(1);
+      }
+      acol[nnza] = i;
+      arow[nnza] = i;
+      aelt[nnza] = rcond - shift;
+    }
+  }
+
+  /*---------------------------------------------------------------------
+  c       ... make the sparse matrix from list of elements with duplicates
+  c           (v and iv are used as  workspace)
+  c---------------------------------------------------------------------*/
+  sparse(a, colidx, rowstr, n, arow, acol, aelt, firstrow, lastrow, v, &(iv[0]), &(iv[n]), nnza);
+}
+
+/*---------------------------------------------------
+c       generate a sparse matrix from a list of
+c       [col, row, element] tri
+c---------------------------------------------------*/
+static void sparse(
+  double a[],    /* a[1:*] */
+  int colidx[],  /* colidx[1:*] */
+  int rowstr[],  /* rowstr[1:*] */
+  int n,
+  int arow[],    /* arow[1:*] */
+  int acol[],    /* acol[1:*] */
+  double aelt[],  /* aelt[1:*] */
+  int firstrow,
+  int lastrow,
+  double x[],    /* x[1:n] */
+  boolean mark[],  /* mark[1:n] */
+  int nzloc[],  /* nzloc[1:n] */
+  int nnza)
+/*---------------------------------------------------------------------
+c       rows range from firstrow to lastrow
+c       the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
+c---------------------------------------------------------------------*/
+{
+  int nrows;
+  int i, j, jajp1, nza, k, nzrow;
+  double xi;
+
+  /*--------------------------------------------------------------------
+  c    how many rows of result
+  c-------------------------------------------------------------------*/
+  nrows = lastrow - firstrow + 1;
+
+  /*--------------------------------------------------------------------
+  c     ...count the number of triples in each row
+  c-------------------------------------------------------------------*/
+
+  for (j = 1; j <= n; j++) {
+    rowstr[j] = 0;
+    mark[j] = FALSE;
+  }
+  rowstr[n+1] = 0;
+
+  for (nza = 1; nza <= nnza; nza++) {
+    j = (arow[nza] - firstrow + 1) + 1;
+    rowstr[j] = rowstr[j] + 1;
+  }
+  rowstr[1] = 1;
+  for (j = 2; j <= nrows+1; j++) {
+    rowstr[j] = rowstr[j] + rowstr[j-1];
+  }
+
+  /*---------------------------------------------------------------------
+  c     ... rowstr(j) now is the location of the first nonzero
+  c           of row j of a
+  c---------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c     ... do a bucket sort of the triples on the row index
+  c-------------------------------------------------------------------*/
+  for (nza = 1; nza <= nnza; nza++) {
+    j = arow[nza] - firstrow + 1;
+    k = rowstr[j];
+    a[k] = aelt[nza];
+    colidx[k] = acol[nza];
+    rowstr[j] = rowstr[j] + 1;
+  }
+
+  /*--------------------------------------------------------------------
+  c       ... rowstr(j) now points to the first element of row j+1
+  c-------------------------------------------------------------------*/
+  for (j = nrows; j >= 1; j--) {
+    rowstr[j+1] = rowstr[j];
+  }
+  rowstr[1] = 1;
+
+  /*--------------------------------------------------------------------
+  c       ... generate the actual output rows by adding elements
+  c-------------------------------------------------------------------*/
+  nza = 0;
+
+  for (i = 1; i <= n; i++) {
+    x[i] = 0.0;
+    mark[i] = FALSE;
+  }
+
+  jajp1 = rowstr[1];
+  for (j = 1; j <= nrows; j++) {
+    nzrow = 0;
+
+    /*--------------------------------------------------------------------
+    c          ...loop over the jth row of a
+    c-------------------------------------------------------------------*/
+    for (k = jajp1; k < rowstr[j+1]; k++) {
+      i = colidx[k];
+      x[i] = x[i] + a[k];
+      if ( mark[i] == FALSE && x[i] != 0.0) {
+        mark[i] = TRUE;
+        nzrow = nzrow + 1;
+        nzloc[nzrow] = i;
+      }
+    }
+
+    /*--------------------------------------------------------------------
+    c          ... extract the nonzeros of this row
+    c-------------------------------------------------------------------*/
+    for (k = 1; k <= nzrow; k++) {
+      i = nzloc[k];
+      mark[i] = FALSE;
+      xi = x[i];
+      x[i] = 0.0;
+      if (xi != 0.0) {
+        nza = nza + 1;
+        a[nza] = xi;
+        colidx[nza] = i;
+      }
+    }
+    jajp1 = rowstr[j+1];
+    rowstr[j+1] = nza + rowstr[1];
+  }
+}
+
+/*---------------------------------------------------------------------
+c       generate a sparse n-vector (v, iv)
+c       having nzv nonzeros
+c
+c       mark(i) is set to 1 if position i is nonzero.
+c       mark is all zero on entry and is reset to all zero before exit
+c       this corrects a performance bug found by John G. Lewis, caused by
+c       reinitialization of mark on every one of the n calls to sprnvc
+---------------------------------------------------------------------*/
+static void sprnvc(
+  int n,
+  int nz,
+  double v[],    /* v[1:*] */
+  int iv[],    /* iv[1:*] */
+  int nzloc[],  /* nzloc[1:n] */
+  int mark[] )   /* mark[1:n] */
+{
+  int nn1;
+  int nzrow, nzv, ii, i;
+  double vecelt, vecloc;
+
+  nzv = 0;
+  nzrow = 0;
+  nn1 = 1;
+  do {
+    nn1 = 2 * nn1;
+  } while (nn1 < n);
+
+  /*--------------------------------------------------------------------
+  c    nn1 is the smallest power of two not less than n
+  c-------------------------------------------------------------------*/
+
+  while (nzv < nz) {
+    vecelt = randlc(&tran, amult);
+
+    /*--------------------------------------------------------------------
+    c   generate an integer between 1 and n in a portable manner
+    c-------------------------------------------------------------------*/
+    vecloc = randlc(&tran, amult);
+    i = icnvrt(vecloc, nn1) + 1;
+    if (i > n) continue;
+
+    /*--------------------------------------------------------------------
+    c  was this integer generated already?
+    c-------------------------------------------------------------------*/
+    if (mark[i] == 0) {
+      mark[i] = 1;
+      nzrow = nzrow + 1;
+      nzloc[nzrow] = i;
+      nzv = nzv + 1;
+      v[nzv] = vecelt;
+      iv[nzv] = i;
+    }
+  }
+
+  for (ii = 1; ii <= nzrow; ii++) {
+    i = nzloc[ii];
+    mark[i] = 0;
+  }
+}
+
+/*---------------------------------------------------------------------
+* scale a double precision number x in (0,1) by a power of 2 and chop it
+*---------------------------------------------------------------------*/
+static int icnvrt(double x, int ipwr2) {
+  return ((int)(ipwr2 * x));
+}
+
+/*--------------------------------------------------------------------
+c       set ith element of sparse vector (v, iv) with
+c       nzv nonzeros to val
+c-------------------------------------------------------------------*/
+static void vecset(
+  int n,
+  double v[],  /* v[1:*] */
+  int iv[],  /* iv[1:*] */
+  int *nzv,
+  int i,
+  double val)
+{
+  int k;
+  boolean set;
+
+  set = FALSE;
+  for (k = 1; k <= *nzv; k++) {
+    if (iv[k] == i) {
+      v[k] = val;
+      set  = TRUE;
+    }
+  }
+  if (set == FALSE) {
+    *nzv = *nzv + 1;
+    v[*nzv] = val;
+    iv[*nzv] = i;
+  }
+}
diff --git a/NPB/EP/Makefile b/NPB/EP/Makefile
new file mode 100644
index 0000000..c84337e
--- /dev/null
+++ b/NPB/EP/Makefile
@@ -0,0 +1,21 @@
+SHELL=/bin/sh
+BENCHMARK=ep
+BENCHMARKU=EP
+
+include ../config/make.def
+
+OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \
+       ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+	${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS} 
+
+
+ep.o:		ep.cpp npbparams.hpp
+	${CCOMPILE} ep.cpp
+
+clean:
+	- rm -f *.o *~
+	- rm -f npbparams.hpp core
diff --git a/NPB/EP/ep.cpp b/NPB/EP/ep.cpp
new file mode 100644
index 0000000..a1ba2a2
--- /dev/null
+++ b/NPB/EP/ep.cpp
@@ -0,0 +1,289 @@
+/*--------------------------------------------------------------------
+
+  Information on NAS Parallel Benchmarks is available at:
+
+  http://www.nas.nasa.gov/Software/NPB/
+
+  Authors: P. O. Frederickson
+     D. H. Bailey
+     A. C. Woo
+
+  STL version:
+  Nicco Mietzsch <nicco.mietzsch@campus.lmu.de>
+
+  CPP and OpenMP version:
+  Dalvan Griebler <dalvangriebler@gmail.com>
+  Júnior Löff <loffjh@gmail.com>
+
+--------------------------------------------------------------------*/
+#include <libdash.h>
+
+#include "npbparams.hpp"
+#include <iostream>
+#include <../common/npb-CPP.hpp>
+
+
+/* parameters */
+#define MK    16
+#define MM    (M - MK)
+#define NN    (1 << MM)
+#define NK    (1 << MK)
+#define NQ    10
+#define EPSILON   1.0e-8
+#define A   1220703125.0
+#define S   271828183.0
+#define TIMERS_ENABLED  FALSE
+
+struct ep_res {
+  double sx;
+  double sy;
+  double q[NQ];
+};
+
+/*--------------------------------------------------------------------
+    program EMBAR
+c-------------------------------------------------------------------*/
+/*
+c   This is the serial version of the APP Benchmark 1,
+c   the "embarassingly parallel" benchmark.
+c
+c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
+c   numbers.  MK is the Log_2 of the size of each batch of uniform random
+c   numbers.  MK can be set for convenience on a given system, since it does
+c   not affect the results.
+*/
+int main(int argc, char **argv) {
+
+  dash::init(&argc, &argv);
+
+  double Mops, t1, sx, sy, tm, an, gc;
+  double dum[3] = { 1.0, 1.0, 1.0 };
+  int np, i, nit, k_offset, j;
+  boolean verified;
+
+  int nthreads = dash::size();
+
+  if ( 0 == dash::myid() ) {
+
+  char size[13+1];	/* character*13 */
+  /*
+  c   Because the size of the problem is too large to store in a 32-bit
+  c   integer for some classes, we put it into a string (for printing).
+  c   Have to strip off the decimal point put in there by the floating
+  c   point print statement (internal file)
+  */
+
+  printf("\n\n NAS Parallel Benchmarks 4.0 C++ DASH version"" - EP Benchmark\n");
+  printf("\n\n Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br>\n");
+  printf("\n\n DASH version by: Nicco Mietzsch <nicco.mietzsch@campus.lmu.de>\n");
+  sprintf(size, "%12.0f", pow(2.0, M+1));
+  for (j = 13; j >= 1; j--) {
+  if ( size[j] == '.' ) size[j] = ' ';
+  }
+  printf(" Number of random numbers generated: %13s\n", size);
+
+  verified = FALSE;
+  }
+  /*
+  c   Compute the number of "batches" of random number pairs generated
+  c   per processor. Adjust if the number of processors does not evenly
+  c   divide the total number
+  */
+  np = NN;
+
+  /*
+  c   Call the random number generator functions and initialize
+  c   the x-array to reduce the effects of paging on the timings.
+  c   Also, call all mathematical functions that are used. Make
+  c   sure these initializations cannot be eliminated as dead code.
+  */
+  vranlc(0, &(dum[0]), dum[1], &(dum[2]));
+  dum[0] = randlc(&(dum[1]), dum[2]);
+
+  double my_x[2*NK+1];
+  for (i = 0; i < 2*NK+1; i++) my_x[i] = -1.0e99;
+  Mops = log(sqrt(fabs(max(1.0, 1.0))));
+
+  if ( 0 == dash::myid() ) {
+
+  timer_clear(1);
+  timer_clear(2);
+  timer_clear(3);
+  timer_clear(4);
+
+  timer_start(1);
+  }
+
+  vranlc(0, &t1, A, my_x);
+
+  /*   Compute AN = A ^ (2 * NK) (mod 2^46). */
+
+  t1 = A;
+
+  for (i = 1; i <= MK+1; i++) {
+  an = randlc(&t1, t1);
+  }
+
+  dash::Array<ep_res> res(nthreads);
+
+  an = t1;
+  gc = 0.0;
+  res.local[0].sx = 0.0;
+  res.local[0].sy = 0.0;
+
+  for ( i = 0; i <= NQ - 1; i++) {
+  res.local[0].q[i] = 0.0;
+  }
+
+  /*
+  c   Each instance of this loop may be performed independently. We compute
+  c   the k offsets separately to take into account the fact that some nodes
+  c   have more numbers to generate than others
+  */
+  if ( TIMERS_ENABLED == TRUE ) timer_start(4);
+
+  dash::Array<int> v(np);
+
+  //if( 0 == dash::myid()) std::iota(v.begin(), v.end(), 0);
+
+  std::iota(v.lbegin(), v.lend(), dash::myid() * ceil((double) np / dash::size())); //only works when using blocking pattern
+
+  v.barrier();
+
+  dash::for_each(v.begin(), v.end(), [&an, &my_x, &res](int k) {
+
+    double t1, t2, t3, t4, x1, x2;
+    int kk, i, ik, l;
+
+    kk = k;
+    t1 = S;
+    t2 = an;
+
+    //  Find starting seed t1 for this kk.
+
+    for (i = 1; i <= 100; i++) {
+      ik = kk / 2;
+      if (2 * ik != kk) t3 = randlc(&t1, t2);
+      if (ik == 0) break;
+      t3 = randlc(&t2, t2);
+      kk = ik;
+    }
+
+    //	Compute uniform pseudorandom numbers.
+
+    //if (TIMERS_ENABLED == TRUE) timer_start(3);
+    vranlc(2*NK, &t1, A, my_x);
+    //if (TIMERS_ENABLED == TRUE) timer_stop(3);
+
+    //
+    //c	   Compute Gaussian deviates by acceptance-rejection method and
+    //c	   tally counts in concentric square annuli.  This loop is not
+    //c	   vectorizable.
+    //
+    //if (TIMERS_ENABLED == TRUE) timer_start(2);
+
+    for (i = 1; i <= NK; i++) {
+
+      x1 = 2.0 * my_x[2*i-1] - 1.0;
+      x2 = 2.0 * my_x[2*i] - 1.0;
+      t1 = pow2(x1) + pow2(x2);
+      if ( t1 <= 1.0 ) {
+        t2 = sqrt(-2.0 * log(t1) / t1);
+        t3 = (x1 * t2);		// Xi
+        t4 = (x2 * t2);		// Yi
+        l = max(fabs(t3), fabs(t4));
+        res.local[0].q[l] += 1.0;		// counts
+        res.local[0].sx = res.local[0].sx + t3;		// sum of Xi
+        res.local[0].sy = res.local[0].sy + t4;		// sum of Yi
+      }
+    }
+    //if (TIMERS_ENABLED == TRUE) timer_stop(2);
+  });
+
+  v.barrier();
+
+  if ( TIMERS_ENABLED == TRUE ) timer_stop(4);
+
+  ep_res zero;
+
+  res[0] = dash::reduce(res.begin(), res.end(), zero, [](ep_res a, ep_res b){
+    ep_res c;
+    c.sx = a.sx + b.sx;
+    c.sy = a.sy + b.sy;
+    for (int i = 0; i < NQ; ++i) c.q[i] = a.q[i] + b.q[i];
+    return c;
+  });
+
+  res.barrier();
+
+  if ( 0 == dash::myid() ) {
+
+    for (i = 0; i <= NQ-1; i++) {
+      gc = gc + res.local[0].q[i];
+    }
+
+    sx = res.local[0].sx;
+    sy = res.local[0].sy;
+
+    timer_stop(1);
+    tm = timer_read(1);
+
+
+
+    nit = 0;
+    if ( M == 24 ) {
+      if ( (fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON) ) {
+        verified = TRUE;
+      }
+    } else if ( M == 25 ) {
+      if ( (fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON) ) {
+        verified = TRUE;
+      }
+    } else if ( M == 28 ) {
+      //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
+      if ( (fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON) ) {
+        verified = TRUE;
+      }
+    } else if ( M == 30 ) {
+      if ( (fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON) ) {
+        verified = TRUE;
+      }
+    } else if ( M == 32 ) {
+      if ( (fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON) ) {
+        verified = TRUE;
+      }
+    } else if ( M == 36 ) {
+      if ( (fabs((sx- (1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON) ) {
+        verified = TRUE;
+      }
+    } else if ( M == 40 ) {
+      if ( (fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON) ) {
+        verified = TRUE;
+      }
+    }
+
+    Mops = pow(2.0, M+1)/tm/1000000.0;
+
+    printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n"
+       "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy);
+    for (i = 0; i  <= NQ-1; i++) {
+      printf("%3d %15.0f\n", i, res.local[0].q[i]);
+    }
+
+    c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, (char*)"Random numbers generated",
+    verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);
+
+    if ( TIMERS_ENABLED == TRUE ) {
+      printf("Total time:	 %f\n", timer_read(1));
+      printf("Gaussian pairs: %f\n", timer_read(2));
+      printf("Random numbers: %f\n", timer_read(3));
+    }
+
+  }
+
+
+
+  dash::finalize();
+
+  return 0;
+}
diff --git a/NPB/FT/Makefile b/NPB/FT/Makefile
new file mode 100644
index 0000000..7e1e4a8
--- /dev/null
+++ b/NPB/FT/Makefile
@@ -0,0 +1,20 @@
+SHELL=/bin/sh
+BENCHMARK=ft
+BENCHMARKU=FT
+
+include ../config/make.def
+
+OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \
+       ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+	${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS} 
+
+ft.o:             ft.cpp  global.hpp npbparams.hpp
+	${CCOMPILE} ft.cpp
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f ft npbparams.hpp core
diff --git a/NPB/FT/ft.cpp b/NPB/FT/ft.cpp
new file mode 100644
index 0000000..0574755
--- /dev/null
+++ b/NPB/FT/ft.cpp
@@ -0,0 +1,1160 @@
+/*--------------------------------------------------------------------
+
+  Information on NAS Parallel Benchmarks is available at:
+
+  http://www.nas.nasa.gov/Software/NPB/
+
+  Authors: D. Bailey
+     W. Saphir
+
+  STL version:
+  Nicco Mietzsch <nicco.mietzsch@campus.lmu.de>
+
+  CPP and OpenMP version:
+      Dalvan Griebler <dalvangriebler@gmail.com>
+      Júnior Löff <loffjh@gmail.com>
+
+--------------------------------------------------------------------*/
+
+#include <libdash.h>
+
+#include <iostream>
+#include "npb-CPP.hpp"
+
+/* global variables */
+#include "global.hpp"
+
+/* function declarations */
+static void evolve(dash::NArray<dcomplex, 3> &u0, dash::NArray<dcomplex, 3> &u1, int t, dash::NArray<int, 3> &indexmap);
+static void compute_initial_conditions(dash::NArray<dcomplex, 3> &u0);
+static void ipow46(double a, int exponent, double *result);
+static void setup(void);
+static void compute_indexmap(dash::NArray<int, 3> &indexmap);
+static void print_timers(void);
+static void fft(int dir, dash::NArray<dcomplex, 3> &x1, dash::NArray<dcomplex, 3> &x2);
+static void cffts1(int is, int d[3], dash::NArray<dcomplex, 3> &x, dash::NArray<dcomplex, 3> &xout);
+static void cffts2(int is, int d[3], dash::NArray<dcomplex, 3> &x, dash::NArray<dcomplex, 3> &xout);
+static void cffts3(int is, int d[3], dash::NArray<dcomplex, 3> &x, dash::NArray<dcomplex, 3> &xout);
+static void fft_init (int n);
+static void cfftz (int is, int m, int n, dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD]);
+static void fftz2 (int is, int l, int m, int n, int ny, int ny1, dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD]);
+static int ilog2(int n);
+static void checksum(int i, dash::NArray<dcomplex, 3> &u1);
+static void verify (int d1, int d2, int d3, int nt, boolean *verified, char *class_npb);
+
+/*--------------------------------------------------------------------
+c FT benchmark
+c-------------------------------------------------------------------*/
+
+int main(int argc, char **argv) {
+
+
+  dash::init(&argc, &argv);
+  /*c-------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  int i;
+
+  /*------------------------------------------------------------------
+  c u0, u1, u2 are the main arrays in the problem.
+  c Depending on the decomposition, these arrays will have different
+  c dimensions. To accomodate all possibilities, we allocate them as
+  c one-dimensional arrays and pass them to subroutines for different
+  c views
+  c  - u0 contains the initial (transformed) initial condition
+  c  - u1 and u2 are working arrays
+  c  - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the
+  c  time evolution operator.
+  c-----------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c Large arrays are in common so that they are allocated on the
+  c heap rather than the stack. This common block is not
+  c referenced directly anywhere else. Padding is to avoid accidental
+  c cache problems, since all array sizes are powers of two.
+  c-------------------------------------------------------------------*/
+  auto distspec = dash::DistributionSpec<3>( dash::BLOCKED, dash::NONE , dash::NONE);
+
+  static dash::NArray<dcomplex, 3> u0(NZ, NY, NX, distspec);
+  /*static dcomplex pad1[3];*/
+  static dash::NArray<dcomplex, 3> u1(NZ, NY, NX, distspec);
+  /*static dcomplex pad2[3];*/
+  static dash::NArray<dcomplex, 3> u2(NZ, NY, NX, distspec);
+  /*static dcomplex pad3[3];*/
+  static dash::NArray<int, 3> indexmap(NZ, NY, NX, distspec);
+
+  int nthreads = dash::size();
+
+  int iter;
+  double total_time, mflops;
+  boolean verified;
+  char class_npb;
+
+  /*--------------------------------------------------------------------
+  c Run the entire problem once to make sure all data is touched.
+  c This reduces variable startup costs, which is important for such a
+  c short benchmark. The other NPB 2 implementations are similar.
+  c-------------------------------------------------------------------*/
+  if ( 0 == dash::myid() ) {
+    for (i = 0; i < T_MAX; i++) {
+      timer_clear(i);
+    }
+  }
+  setup();
+
+  compute_indexmap(indexmap);
+  compute_initial_conditions(u1);
+  fft_init (dims[0][0]);
+  dash::barrier();
+
+  fft(1, u1, u0);
+  dash::barrier();
+
+  /*--------------------------------------------------------------------
+  c Start over from the beginning. Note that all operations must
+  c be timed, in contrast to other benchmarks.
+  c-------------------------------------------------------------------*/
+  if ( 0 == dash::myid() ) {
+    for (i = 0; i < T_MAX; i++) {
+      timer_clear(i);
+    }
+
+    //std::clear();
+
+    timer_start(T_TOTAL);
+    if ( TIMERS_ENABLED == TRUE ) timer_start(T_SETUP);
+  }
+
+  compute_indexmap(indexmap);
+
+  compute_initial_conditions(u1);
+
+  fft_init (dims[0][0]);
+  dash::barrier();
+
+  if ( 0 == dash::myid() ) {
+    if ( TIMERS_ENABLED == TRUE ) {
+      timer_stop(T_SETUP);
+    }
+    if ( TIMERS_ENABLED == TRUE ) {
+      timer_start(T_FFT);
+    }
+  }
+
+  fft(1, u1, u0);
+  dash::barrier();
+
+  if ( 0 == dash::myid() ) {
+    if (TIMERS_ENABLED == TRUE) {
+      timer_stop(T_FFT);
+    }
+  }
+
+  for (iter = 1; iter <= niter; iter++) {
+
+    if ( 0 == dash::myid() ) {
+      if ( TIMERS_ENABLED == TRUE ) {
+        timer_start(T_EVOLVE);
+      }
+    }
+
+    evolve(u0, u1, iter, indexmap);
+    dash::barrier();
+
+    if ( 0 == dash::myid() ) {
+      if ( TIMERS_ENABLED == TRUE ) {
+        timer_stop(T_EVOLVE);
+      }
+      if ( TIMERS_ENABLED == TRUE ) {
+        timer_start(T_FFT);
+      }
+    }
+
+    fft(-1, u1, u2);
+    dash::barrier();
+
+    if ( 0 == dash::myid() ) {
+      if ( TIMERS_ENABLED == TRUE ) {
+        timer_stop(T_FFT);
+      }
+      if ( TIMERS_ENABLED == TRUE ) {
+        timer_start(T_CHECKSUM);
+      }
+    }
+
+    checksum(iter, u2);
+
+    if ( 0 == dash::myid() ) {
+      if ( TIMERS_ENABLED == TRUE ) {
+        timer_stop(T_CHECKSUM);
+      }
+    }
+  }
+
+if ( 0 == dash::myid() ) {
+    verify(NX, NY, NZ, niter, &verified, &class_npb);
+
+    timer_stop(T_TOTAL);
+
+    total_time = timer_read(T_TOTAL);
+
+    if ( total_time != 0.0 ) {
+      mflops = 1.0e-6*(double)(NTOTAL) *
+      (14.8157+7.19641*log((double)(NTOTAL))
+      +  (5.23518+7.21113*log((double)(NTOTAL)))*niter)/total_time;
+    } else {
+      mflops = 0.0;
+    }
+    c_print_results((char*)"FT", class_npb, NX, NY, NZ, niter, nthreads, total_time, mflops, (char*)"      floating point", verified,
+    (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);
+    if ( TIMERS_ENABLED == TRUE ) print_timers();
+  }
+  dash::finalize();
+
+  return 0;
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void evolve(dash::NArray<dcomplex, 3> &u0, dash::NArray<dcomplex, 3> &u1, int t, dash::NArray<int, 3> &indexmap) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c evolve u0 -> u1 (t time steps) in fourier space
+  c-------------------------------------------------------------------*/
+
+  if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX);
+
+  for (int k = 0; k < u1.local.extent(0); k++){
+    for (int j = 0; j < dims[0][1]; j++) {
+      for (int i = 0; i < NX; i++) {
+        crmul(u1.local(k,j,i), u0.local(k,j,i), ex[t*indexmap.local(k,j,i)]);
+      }
+    }
+  }
+  if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX);
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void compute_initial_conditions(dash::NArray<dcomplex, 3> &u0) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c Fill in array u0 with initial conditions from
+  c random number generator
+  c-------------------------------------------------------------------*/
+
+  /*double x0, start, an, dummy;*/
+  double start, an;
+  //double tmp[NX*2*MAXDIM+1];
+  dash::Array<double> starts(NZ*dash::size());
+
+  start = SEED;
+  /*--------------------------------------------------------------------
+  c Jump to the starting element for our first plane.
+  c-------------------------------------------------------------------*/
+  ipow46(A, (zstart[0]-1)*2*NX*NY + (ystart[0]-1)*2*NX, &an);
+  /*dummy = */randlc(&start, an);
+  ipow46(A, 2*NX*NY, &an);
+
+  starts.local[0] = start;
+  for (int i=1; i<dims[0][2]; i++){
+    randlc(&start, an);
+    starts.local[i] = start;
+  }
+
+  /*--------------------------------------------------------------------
+  c Go through by z planes filling in one square at a time.
+  c-------------------------------------------------------------------*/
+  int zplanes_per_unit = ceil((double) u0.extent(0) / dash::size());
+  int myoffset = dash::myid()*zplanes_per_unit;
+
+  if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX);
+
+  for (int k = 0; k < u0.local.extent(0); k++){
+    double * tmp = new double[NX*2*MAXDIM+1];
+
+    double x0 = starts.local[k+myoffset];
+    vranlc(2*NX*dims[0][1], &x0, A, tmp);
+
+    int t = 1;
+    for (int j = 0; j < dims[0][1]; j++) {
+      for (int i = 0; i < NX; i++) {
+        u0.local(k,j,i).real = tmp[t++];
+        u0.local(k,j,i).imag = tmp[t++];
+      }
+      //if (k != dims[0][2]) /*dummy = */randlc(&start, an);
+    }
+  }
+  if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX);
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void ipow46(double a, int exponent, double *result) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c compute a^exponent mod 2^46
+  c-------------------------------------------------------------------*/
+
+  /*double dummy, q, r;*/
+  double q, r;
+  int n, n2;
+
+  /*--------------------------------------------------------------------
+  c Use
+  c   a^n = a^(n/2)*a^(n/2) if n even else
+  c   a^n = a*a^(n-1)     if n odd
+  c-------------------------------------------------------------------*/
+  *result = 1;
+  if ( exponent == 0 ) return;
+  q = a;
+  r = 1;
+  n = exponent;
+
+  while (n > 1) {
+    n2 = n/2;
+    if ( n2 * 2 == n ) {
+      /*dummy = */randlc(&q, q);
+      n = n2;
+    } else {
+      /*dummy = */randlc(&r, q);
+      n = n-1;
+    }
+  }
+  /*dummy = */randlc(&r, q);
+  *result = r;
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void setup(void) {
+
+  int i;
+  niter = NITER_DEFAULT;
+  if ( 0 == dash::myid() ) {
+    /*--------------------------------------------------------------------
+    c-------------------------------------------------------------------*/
+
+    /*int ierr, i, j, fstatus;*/
+    //int i;
+
+    printf("\n\n NAS Parallel Benchmarks 4.0 C++ DASH version" " - FT Benchmark\n\n");
+    printf("\n\n Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br>\n");
+    printf("\n\n DASH version by: Nicco Mietzsch <nicco.mietzsch@campus.lmu.de>\n");
+
+    printf(" Size        : %3dx%3dx%3d\n", NX, NY, NZ);
+    printf(" Iterations      :   %7d\n", niter);
+
+    /* 1004 format(' Number of processes :   ', i7)
+    1005 format(' Processor array   :   ', i3, 'x', i3)
+    1006 format(' WARNING: compiled for ', i5, ' processes. ',
+    >     ' Will not verify. ')*/
+  }
+
+  for (i = 0;i < 3 ; i++) {
+    dims[i][0] = NX;
+    dims[i][1] = NY;
+    dims[i][2] = NZ;
+  }
+
+
+  for (i = 0; i < 3; i++) {
+    xstart[i] = 1;
+    xend[i]   = NX;
+    ystart[i] = 1;
+    yend[i]   = NY;
+    zstart[i] = 1;
+    zend[i]   = NZ;
+  }
+
+  /*--------------------------------------------------------------------
+  c Set up info for blocking of ffts and transposes.  This improves
+  c performance on cache-based systems. Blocking involves
+  c working on a chunk of the problem at a time, taking chunks
+  c along the first, second, or third dimension.
+  c
+  c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+  c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+  c
+  c Since 1st dim is always in processor, we'll assume it's long enough
+  c (default blocking factor is 16 so min size for 1st dim is 16)
+  c The only case we have to worry about is cffts1 in a 2d decomposition.
+  c so the blocking factor should not be larger than the 2nd dimension.
+  c-------------------------------------------------------------------*/
+
+  fftblock = FFTBLOCK_DEFAULT;
+  fftblockpad = FFTBLOCKPAD_DEFAULT;
+
+  if ( fftblock != FFTBLOCK_DEFAULT ) fftblockpad = fftblock+3;
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void compute_indexmap(dash::NArray<int, 3> &indexmap) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+  c for time evolution exponent.
+  c-------------------------------------------------------------------*/
+
+  int i;
+  double ap;
+
+  /*--------------------------------------------------------------------
+  c basically we want to convert the fortran indices
+  c   1 2 3 4 5 6 7 8
+  c to
+  c   0 1 2 3 -4 -3 -2 -1
+  c The following magic formula does the trick:
+  c mod(i-1+n/2, n) - n/2
+  c-------------------------------------------------------------------*/
+
+  if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX);
+
+  int zplanes_per_unit = ceil((double) indexmap.extent(0) / dash::size());
+  int myoffset = dash::myid()*zplanes_per_unit;
+
+  for (int k = 0; k < indexmap.local.extent(0); k++) {
+    int kk = (k+myoffset+1+zstart[2]-2+NZ/2)%NZ - NZ/2;
+    int kk2 = kk*kk;
+    for (int j = 0; j < dims[2][1]; j++) {
+      int jj = (j+1+ystart[2]-2+NY/2)%NY - NY/2;
+      int jj2 = jj*jj;
+      for (int i = 0; i < dims[2][0]; i++) {
+        int ii =  (i+1+xstart[2]-2+NX/2)%NX - NX/2;
+        indexmap.local(k,j,i) = kk2+jj2+ii*ii;
+      }
+    }
+  }
+
+  if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX);
+
+  /*--------------------------------------------------------------------
+  c compute array of exponentials for time evolution.
+  c-------------------------------------------------------------------*/
+  ap = - 4.0 * ALPHA * PI * PI;
+
+  ex[0] = 1.0;
+  ex[1] = exp(ap);
+  for (i = 2; i <= EXPMAX; i++) {
+    ex[i] = ex[i-1]*ex[1];
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void print_timers(void) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  int i;
+  const char *tstrings[] = {
+  "    total ",
+  "    setup ",
+  "      fft ",
+  "   evolve ",
+  " checksum ",
+  "   fftlow ",
+  "  fftcopy "};
+
+  for (i = 0; i < T_MAX; i++) {
+    if ( timer_read(i) != 0.0 ) {
+      printf("timer %2d(%s) :%10.6f\n", i, tstrings[i], timer_read(i));
+    }
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void fft(int dir, dash::NArray<dcomplex, 3> &x1, dash::NArray<dcomplex, 3> &x2) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c note: args x1, x2 must be different arrays
+  c note: args for cfftsx are (direction, layout, xin, xout, scratch)
+  c     xin/xout may be the same and it can be somewhat faster
+  c     if they are
+  c-------------------------------------------------------------------*/
+
+  if ( dir == 1 ) {
+    cffts1(1, dims[0], x1, x1);/* x1 -> x1 */
+    cffts2(1, dims[1], x1, x1);/* x1 -> x1 */
+    dash::barrier();
+    cffts3(1, dims[2], x1, x2);/* x1 -> x2 */
+  } else {
+    cffts3(-1, dims[2], x1, x1);/* x1 -> x1 */
+    dash::barrier();
+    cffts2(-1, dims[1], x1, x1);/* x1 -> x1 */
+    cffts1(-1, dims[0], x1, x2);/* x1 -> x2 */
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void cffts1(int is, int d[3], dash::NArray<dcomplex, 3> &x, dash::NArray<dcomplex, 3> &xout) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  int logd[3];
+
+  for (int i = 0; i < 3; i++) {
+    logd[i] = ilog2(d[i]);
+  }
+
+  if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX);
+
+  for (int k = 0; k < x.local.extent(0); k++) {
+    dcomplex y0[NX][FFTBLOCKPAD];
+    dcomplex y1[NX][FFTBLOCKPAD];
+
+    for (int jj = 0; jj <= d[1] - fftblock; jj+=fftblock) {
+        //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY);
+        for (int j = 0; j < fftblock; j++) {
+          for (int i = 0; i < d[0]; i++) {
+            y0[i][j].real = x.local(k,j+jj,i).real;
+            y0[i][j].imag = x.local(k,j+jj,i).imag;
+          }
+        }
+        //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY);
+        //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW);
+        cfftz (is, logd[0], d[0], y0, y1);
+        //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW);
+        //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY);
+        for (int j = 0; j < fftblock; j++) {
+          for (int i = 0; i < d[0]; i++) {
+            xout.local(k,j+jj,i).real = y0[i][j].real;
+            xout.local(k,j+jj,i).imag = y0[i][j].imag;
+          }
+        }
+      //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY);
+      }
+  }
+  if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX);
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void cffts2(int is, int d[3], dash::NArray<dcomplex, 3> &x, dash::NArray<dcomplex, 3> &xout) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  int logd[3];
+
+  for (int i = 0; i < 3; i++) {
+    logd[i] = ilog2(d[i]);
+  }
+
+  if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX);
+
+  for (int k = 0; k < x.local.extent(0); k++) {
+    dcomplex y0[NX][FFTBLOCKPAD];
+    dcomplex y1[NX][FFTBLOCKPAD];
+
+    for (int ii = 0; ii <= d[0] - fftblock; ii+=fftblock) {
+      //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY);
+      for (int j = 0; j < d[1]; j++) {
+        for (int i = 0; i < fftblock; i++) {
+          y0[j][i].real = x.local(k,j,i+ii).real;
+          y0[j][i].imag = x.local(k,j,i+ii).imag;
+        }
+        // int offset = k*NX*NY+j*NX;
+        // std::copy(x.lbegin()+offset+ii, x.lbegin()+offset+ii+fftblock, &y0[j][0]);
+      }
+      //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY);
+      //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW);
+      cfftz (is, logd[1], d[1], y0, y1);
+      //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW);
+      //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY);
+      for (int j = 0; j < d[1]; j++) {
+        // for (int i = 0; i < fftblock; i++) {
+        //   xout.local(k,j,i+ii).real = y0[j][i].real;
+        //   xout.local(k,j,i+ii).imag = y0[j][i].imag;
+        // }
+        int offset = k*NX*NY+j*NX;
+        std::copy(&y0[j][0], &y0[j][fftblock], xout.lbegin()+offset+ii);
+        //
+        // The combination of reading in sequential and writing with std::copy
+        // seems to be the fastest with my setup. This definitely needs
+        // further investigation.
+        //
+      }
+    //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY);
+    }
+  }
+  if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX);
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void cffts3(int is, int d[3], dash::NArray<dcomplex, 3> &x, dash::NArray<dcomplex, 3> &xout) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  int logd[3];
+
+  for (int i = 0;i < 3; i++) {
+    logd[i] = ilog2(d[i]);
+  }
+
+  if ( TIMERS_ENABLED == TRUE ) timer_start(T_MAX);
+
+  int yplanes_per_unit = ceil((double) d[1] / dash::size()); //this is effectively a blocked pattern. It could be better to use cyclic for higher unit counts.
+  int myoffset = dash::myid()*yplanes_per_unit;
+
+  for (int j = 0; j < yplanes_per_unit && myoffset+j < d[1]; j++) {
+
+    dcomplex y0[NX][FFTBLOCKPAD];
+    dcomplex y1[NX][FFTBLOCKPAD];
+
+    // dash::Future<dcomplex*> futs[d[2]];
+    // typedef typename dash::Future<dash::GlobIter<dcomplex, dash::BlockPattern<3, (dash::MemArrange)1, long int>, dash::GlobStaticMem<dash::HostSpace>, dash::GlobPtr<dcomplex, dash::GlobStaticMem<dash::HostSpace> >, dash::GlobRef<dcomplex> > > async_write_type;
+    // async_write_type futs_w[d[2]];
+
+    //
+    // Asynchronous copy does not seem to yield anything, at best it is equally good.
+    //
+
+    for (int ii = 0; ii <= d[0] - fftblock; ii+=fftblock) {
+      //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY);
+      for (int k = 0; k < d[2]; k++) {
+        // for (int i = 0; i < fftblock; i++) {
+           //y0[k][i] = x(k,myoffset+j,i+ii);
+        // }
+        int offset = k*NX*NY+(myoffset+j)*NX;
+        dash::copy(x.begin()+offset+ii, x.begin()+offset+ii+fftblock, &y0[k][0]);
+        // futs[k] = dash::copy_async(x.begin()+offset+ii, x.begin()+offset+ii+fftblock, &y0[k][0]);
+      }
+
+      // for (int k = 0; k < d[2]; k++) futs[k].wait();
+
+      //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY);
+      //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW);
+      cfftz (is, logd[2], d[2], y0, y1);
+      //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW);
+      //  if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY);
+      for (int k = 0; k < d[2]; k++) {
+        // for (int i = 0; i < fftblock; i++) {
+          // xout(k,myoffset+j,i+ii) = y0[k][i];
+        // }
+        int offset = k*NX*NY+(myoffset+j)*NX;
+        dash::copy(&y0[k][0], &y0[k][fftblock], xout.begin()+offset+ii);
+        // futs_w[k] = dash::copy_async(&y0[k][0], &y0[k][fftblock], xout.begin()+offset+ii);
+      }
+      // for (int k = 0; k < d[2]; k++) futs_w[k].wait();
+    //  if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY);
+    }
+  }
+  if ( TIMERS_ENABLED == TRUE ) timer_stop(T_MAX);
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void fft_init (int n) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c compute the roots-of-unity array that will be used for subsequent FFTs.
+  c-------------------------------------------------------------------*/
+
+  /*int m,nu,ku,i,j,ln;*/
+  int m,ku,i,j,ln;
+  double t, ti;
+
+
+  /*--------------------------------------------------------------------
+  c   Initialize the U array with sines and cosines in a manner that permits
+  c   stride one access at each FFT iteration.
+  c-------------------------------------------------------------------*/
+  /*nu = n;*/
+  m = ilog2(n);
+  u[0].real = (double)m;
+  u[0].imag = 0.0;
+  ku = 1;
+  ln = 1;
+
+  for (j = 1; j <= m; j++) {
+    t = PI / ln;
+
+    for (i = 0; i <= ln - 1; i++) {
+      ti = i * t;
+      u[i+ku].real = cos(ti);
+      u[i+ku].imag = sin(ti);
+    }
+
+    ku = ku + ln;
+    ln = 2 * ln;
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void cfftz (int is, int m, int n, dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD]) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+  c   to Swarztrauber.  X is both the input and the output array, while Y is a
+  c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
+  c   perform FFTs, the array U must be initialized by calling CFFTZ with IS
+  c   set to 0 and M set to MX, where MX is the maximum value of M for any
+  c   subsequent call.
+  c-------------------------------------------------------------------*/
+
+  int i,j,l,mx;
+
+  /*--------------------------------------------------------------------
+  c   Check if input parameters are invalid.
+  c-------------------------------------------------------------------*/
+  mx = (int)(u[0].real);
+  if ( (is != 1 && is != -1) || m < 1 || m > mx ) {
+    printf("CFFTZ: Either U has not been initialized, or else\n"
+    "one of the input parameters is invalid%5d%5d%5d\n",
+    is, m, mx);
+    exit(1);
+  }
+
+  /*--------------------------------------------------------------------
+  c   Perform one variant of the Stockham FFT.
+  c-------------------------------------------------------------------*/
+  for (l = 1; l <= m; l+=2) {
+    fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y);
+    if ( l == m ) break;
+    fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x);
+  }
+
+  /*--------------------------------------------------------------------
+  c   Copy Y to X.
+  c-------------------------------------------------------------------*/
+  if ( m % 2 == 1 ) {
+    for (j = 0; j < n; j++) {
+      for (i = 0; i < fftblock; i++) {
+        x[j][i].real = y[j][i].real;
+        x[j][i].imag = y[j][i].imag;
+      }
+    }
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void fftz2 (int is, int l, int m, int n, int ny, int ny1, dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD]) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   Performs the L-th iteration of the second variant of the Stockham FFT.
+  c-------------------------------------------------------------------*/
+
+  int k,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22;
+  /*dcomplex u1,x11,x21;*/
+  dcomplex u1;
+
+  /*--------------------------------------------------------------------
+  c   Set initial parameters.
+  c-------------------------------------------------------------------*/
+
+  n1 = n / 2;
+  if ( l-1 == 0 ) {
+    lk = 1;
+  } else {
+    lk = 2 << ((l - 1)-1);
+  }
+  if ( m-l == 0 ) {
+    li = 1;
+  } else {
+    li = 2 << ((m - l)-1);
+  }
+  lj = 2 * lk;
+  ku = li;
+
+  for (i = 0; i < li; i++) {
+
+    i11 = i * lk;
+    i12 = i11 + n1;
+    i21 = i * lj;
+    i22 = i21 + lk;
+    if ( is >= 1 ) {
+      u1.real = u[ku+i].real;
+      u1.imag = u[ku+i].imag;
+    } else {
+      u1.real = u[ku+i].real;
+      u1.imag = -u[ku+i].imag;
+    }
+
+    /*--------------------------------------------------------------------
+    c   This loop is vectorizable.
+    c-------------------------------------------------------------------*/
+    //but it creates more overhead
+
+    for (k = 0; k < lk; k++) {
+      for (j = 0; j < ny; j++) {
+        double x11real, x11imag;
+        double x21real, x21imag;
+        x11real = x[i11+k][j].real;
+        x11imag = x[i11+k][j].imag;
+        x21real = x[i12+k][j].real;
+        x21imag = x[i12+k][j].imag;
+        y[i21+k][j].real = x11real + x21real;
+        y[i21+k][j].imag = x11imag + x21imag;
+        y[i22+k][j].real = u1.real * (x11real - x21real)
+        - u1.imag * (x11imag - x21imag);
+        y[i22+k][j].imag = u1.real * (x11imag - x21imag)
+        + u1.imag * (x11real - x21real);
+      }
+    }
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static int ilog2(int n) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  int nn, lg;
+
+  if ( n == 1 ) {
+    return 0;
+  }
+  lg = 1;
+  nn = 2;
+  while (nn < n) {
+    nn = nn << 1;
+    lg++;
+  }
+
+  return lg;
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void checksum(int i, dash::NArray<dcomplex, 3> &u1) {
+if ( 0 == dash::myid() ) {
+    /*--------------------------------------------------------------------
+    c-------------------------------------------------------------------*/
+    /*int j, q,r,s, ierr;*/
+    int j, q,r,s;
+    /*dcomplex chk,allchk;*/
+    dcomplex chk;
+
+    chk.real = 0.0;
+    chk.imag = 0.0;
+
+    for (j = 1; j <= 1024; j++) {
+        q = j%NX+1;
+        if ( q >= xstart[0] && q <= xend[0] ) {
+            r = (3*j)%NY+1;
+            if ( r >= ystart[0] && r <= yend[0] ) {
+                s = (5*j)%NZ+1;
+                if ( s >= zstart[0] && s <= zend[0] ) {
+                    dcomplex v1 = u1(s-zstart[0],r-ystart[0],q-xstart[0]);
+                    cadd(chk,chk,v1);
+                }
+            }
+        }
+    }
+
+    {
+        sums[i].real += chk.real;
+        sums[i].imag += chk.imag;
+    }
+
+
+    {
+        /* complex % real */
+        sums[i].real = sums[i].real/(double)(NTOTAL);
+        sums[i].imag = sums[i].imag/(double)(NTOTAL);
+
+        printf("T = %5d     Checksum = %22.12e %22.12e\n",
+        i, sums[i].real, sums[i].imag);
+    }
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void verify (int d1, int d2, int d3, int nt, boolean *verified, char *class_npb) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*int ierr, size, i;*/
+  int i;
+  double err, epsilon;
+
+  /*--------------------------------------------------------------------
+  c   Sample size reference checksums
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   class_npb S size reference checksums
+  c-------------------------------------------------------------------*/
+  double vdata_real_s[6+1] = { 0.0,
+         5.546087004964e+02,
+         5.546385409189e+02,
+         5.546148406171e+02,
+         5.545423607415e+02,
+         5.544255039624e+02,
+         5.542683411902e+02 };
+  double vdata_imag_s[6+1] = { 0.0,
+         4.845363331978e+02,
+         4.865304269511e+02,
+         4.883910722336e+02,
+         4.901273169046e+02,
+         4.917475857993e+02,
+         4.932597244941e+02 };
+  /*--------------------------------------------------------------------
+  c   class_npb W size reference checksums
+  c-------------------------------------------------------------------*/
+  double vdata_real_w[6+1] = { 0.0,
+         5.673612178944e+02,
+         5.631436885271e+02,
+         5.594024089970e+02,
+         5.560698047020e+02,
+         5.530898991250e+02,
+         5.504159734538e+02 };
+  double vdata_imag_w[6+1] = { 0.0,
+         5.293246849175e+02,
+         5.282149986629e+02,
+         5.270996558037e+02,
+         5.260027904925e+02,
+         5.249400845633e+02,
+         5.239212247086e+02 };
+  /*--------------------------------------------------------------------
+  c   class_npb A size reference checksums
+  c-------------------------------------------------------------------*/
+  double vdata_real_a[6+1] = { 0.0,
+         5.046735008193e+02,
+         5.059412319734e+02,
+         5.069376896287e+02,
+         5.077892868474e+02,
+         5.085233095391e+02,
+         5.091487099959e+02 };
+  double vdata_imag_a[6+1] = { 0.0,
+         5.114047905510e+02,
+         5.098809666433e+02,
+         5.098144042213e+02,
+         5.101336130759e+02,
+         5.104914655194e+02,
+         5.107917842803e+02 };
+  /*--------------------------------------------------------------------
+  c   class_npb B size reference checksums
+  c-------------------------------------------------------------------*/
+  double vdata_real_b[20+1] = { 0.0,
+          5.177643571579e+02,
+          5.154521291263e+02,
+          5.146409228649e+02,
+          5.142378756213e+02,
+          5.139626667737e+02,
+          5.137423460082e+02,
+          5.135547056878e+02,
+          5.133910925466e+02,
+          5.132470705390e+02,
+          5.131197729984e+02,
+          5.130070319283e+02,
+          5.129070537032e+02,
+          5.128182883502e+02,
+          5.127393733383e+02,
+          5.126691062020e+02,
+          5.126064276004e+02,
+          5.125504076570e+02,
+          5.125002331720e+02,
+          5.124551951846e+02,
+          5.124146770029e+02 };
+  double vdata_imag_b[20+1] = { 0.0,
+          5.077803458597e+02,
+          5.088249431599e+02,
+          5.096208912659e+02,
+          5.101023387619e+02,
+          5.103976610617e+02,
+          5.105948019802e+02,
+          5.107404165783e+02,
+          5.108576573661e+02,
+          5.109577278523e+02,
+          5.110460304483e+02,
+          5.111252433800e+02,
+          5.111968077718e+02,
+          5.112616233064e+02,
+          5.113203605551e+02,
+          5.113735928093e+02,
+          5.114218460548e+02,
+          5.114656139760e+02,
+          5.115053595966e+02,
+          5.115415130407e+02,
+          5.115744692211e+02 };
+  /*--------------------------------------------------------------------
+  c   class_npb C size reference checksums
+  c-------------------------------------------------------------------*/
+  double vdata_real_c[20+1] = { 0.0,
+          5.195078707457e+02,
+          5.155422171134e+02,
+          5.144678022222e+02,
+          5.140150594328e+02,
+          5.137550426810e+02,
+          5.135811056728e+02,
+          5.134569343165e+02,
+          5.133651975661e+02,
+          5.132955192805e+02,
+          5.132410471738e+02,
+          5.131971141679e+02,
+          5.131605205716e+02,
+          5.131290734194e+02,
+          5.131012720314e+02,
+          5.130760908195e+02,
+          5.130528295923e+02,
+          5.130310107773e+02,
+          5.130103090133e+02,
+          5.129905029333e+02,
+          5.129714421109e+02 };
+  double vdata_imag_c[20+1] = { 0.0,
+          5.149019699238e+02,
+          5.127578201997e+02,
+          5.122251847514e+02,
+          5.121090289018e+02,
+          5.121143685824e+02,
+          5.121496764568e+02,
+          5.121870921893e+02,
+          5.122193250322e+02,
+          5.122454735794e+02,
+          5.122663649603e+02,
+          5.122830879827e+02,
+          5.122965869718e+02,
+          5.123075927445e+02,
+          5.123166486553e+02,
+          5.123241541685e+02,
+          5.123304037599e+02,
+          5.123356167976e+02,
+          5.123399592211e+02,
+          5.123435588985e+02,
+          5.123465164008e+02 };
+
+  epsilon = 1.0e-12;
+  *verified = TRUE;
+  *class_npb = 'U';
+
+  if ( d1 == 64 &&
+  d2 == 64 &&
+  d3 == 64 &&
+  nt == 6 ) {
+  *class_npb = 'S';
+  for (i = 1; i <= nt; i++) {
+    err = (get_real(sums[i]) - vdata_real_s[i]) / vdata_real_s[i];
+    if ( fabs(err) > epsilon ) {
+      *verified = FALSE;
+      break;
+    }
+    err = (get_imag(sums[i]) - vdata_imag_s[i]) / vdata_imag_s[i];
+    if ( fabs(err) > epsilon ) {
+      *verified = FALSE;
+      break;
+    }
+  }
+  } else if ( d1 == 128 &&
+    d2 == 128 &&
+    d3 == 32 &&
+    nt == 6 ) {
+    *class_npb = 'W';
+    for (i = 1; i <= nt; i++) {
+      err = (get_real(sums[i]) - vdata_real_w[i]) / vdata_real_w[i];
+      if ( fabs(err) > epsilon ) {
+        *verified = FALSE;
+        break;
+      }
+      err = ( get_imag(sums[i] ) - vdata_imag_w[i]) / vdata_imag_w[i];
+      if (fabs(err) > epsilon) {
+        *verified = FALSE;
+        break;
+      }
+    }
+  } else if ( d1 == 256 &&
+    d2 == 256 &&
+    d3 == 128 &&
+    nt == 6 ) {
+    *class_npb = 'A';
+    for (i = 1; i <= nt; i++) {
+      err = (get_real(sums[i]) - vdata_real_a[i]) / vdata_real_a[i];
+      if ( fabs(err) > epsilon ) {
+        *verified = FALSE;
+        break;
+      }
+      err = (get_imag(sums[i]) - vdata_imag_a[i]) / vdata_imag_a[i];
+      if ( fabs(err) > epsilon ) {
+        *verified = FALSE;
+        break;
+      }
+    }
+  } else if ( d1 == 512 && d2 == 256 && d3 == 256 && nt == 20 ) {
+    *class_npb = 'B';
+    for (i = 1; i <= nt; i++) {
+      err = (get_real(sums[i]) - vdata_real_b[i]) / vdata_real_b[i];
+      if ( fabs(err) > epsilon ) {
+        *verified = FALSE;
+        break;
+      }
+      err = (get_imag(sums[i]) - vdata_imag_b[i]) / vdata_imag_b[i];
+      if ( fabs(err) > epsilon ) {
+        *verified = FALSE;
+        break;
+      }
+    }
+  } else if ( d1 == 512 &&
+    d2 == 512 &&
+    d3 == 512 &&
+    nt == 20 ) {
+    *class_npb = 'C';
+    for (i = 1; i <= nt; i++) {
+      err = (get_real(sums[i]) - vdata_real_c[i]) / vdata_real_c[i];
+      if ( fabs(err) > epsilon ) {
+        *verified = FALSE;
+        break;
+      }
+      err = (get_imag(sums[i]) - vdata_imag_c[i]) / vdata_imag_c[i];
+      if ( fabs(err) > epsilon ) {
+        *verified = FALSE;
+        break;
+      }
+    }
+  }
+
+  if ( *class_npb != 'U' ) {
+    printf("Result verification successful\n");
+  } else {
+    printf("Result verification failed\n");
+  }
+  printf("class_npb = %1c\n", *class_npb);
+}
diff --git a/NPB/FT/global.hpp b/NPB/FT/global.hpp
new file mode 100644
index 0000000..a47501d
--- /dev/null
+++ b/NPB/FT/global.hpp
@@ -0,0 +1,111 @@
+#include "npbparams.hpp"
+
+
+/*
+c If processor array is 1x1 -> 0D grid decomposition
+
+
+c Cache blocking params. These values are good for most
+c RISC processors.
+c FFT parameters:
+c  fftblock controls how many ffts are done at a time.
+c  The default is appropriate for most cache-based machines
+c  On vector machines, the FFT can be vectorized with vector
+c  length equal to the block size, so the block size should
+c  be as large as possible. This is the size of the smallest
+c  dimension of the problem: 128 for class A, 256 for class B and
+c  512 for class C.
+*/
+
+#define	FFTBLOCK_DEFAULT	64
+#define	FFTBLOCKPAD_DEFAULT	64 //original: 18
+
+#define FFTBLOCK	FFTBLOCK_DEFAULT
+#define FFTBLOCKPAD	FFTBLOCKPAD_DEFAULT
+
+/* COMMON block: blockinfo */
+int fftblock;
+int fftblockpad;
+
+/*
+c we need a bunch of logic to keep track of how
+c arrays are laid out.
+
+
+c Note: this serial version is the derived from the parallel 0D case
+c of the ft NPB.
+c The computation proceeds logically as
+
+c set up initial conditions
+c fftx(1)
+c transpose (1->2)
+c ffty(2)
+c transpose (2->3)
+c fftz(3)
+c time evolution
+c fftz(3)
+c transpose (3->2)
+c ffty(2)
+c transpose (2->1)
+c fftx(1)
+c compute residual(1)
+
+c for the 0D, 1D, 2D strategies, the layouts look like xxx
+c
+c            0D        1D        2D
+c 1:        xyz       xyz       xyz
+c 2:        xyz       xyz       yxz
+c 3:        xyz       zyx       zxy
+
+c the array dimensions are stored in dims(coord, phase)
+*/
+
+/* COMMON block: layout */
+static int dims[3][3];
+static int xstart[3];
+static int ystart[3];
+static int zstart[3];
+static int xend[3];
+static int yend[3];
+static int zend[3];
+
+#define	T_TOTAL		0
+#define	T_SETUP		1
+#define	T_FFT		2
+#define	T_EVOLVE	3
+#define	T_CHECKSUM	4
+#define	T_FFTLOW	5
+#define	T_FFTCOPY	6
+#define	T_MAX		7
+
+#define	TIMERS_ENABLED	FALSE
+
+/* other stuff */
+
+#define	SEED	314159265.0
+#define	A	1220703125.0
+#define	PI	3.141592653589793238
+#define	ALPHA	1.0e-6
+
+#define	EXPMAX	(NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4))
+
+/* COMMON block: excomm */
+static double ex[EXPMAX+1];	/* ex(0:expmax) */
+
+/*
+c roots of unity array
+c relies on x being largest dimension?
+*/
+
+/* COMMON block: ucomm */
+static dcomplex u[NX];
+
+/* for checksum data */
+
+/* COMMON block: sumcomm */
+static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */
+
+/* number of iterations*/
+
+/* COMMON block: iter */
+static int niter;
diff --git a/NPB/IS/Makefile b/NPB/IS/Makefile
new file mode 100644
index 0000000..40b2865
--- /dev/null
+++ b/NPB/IS/Makefile
@@ -0,0 +1,24 @@
+SHELL=/bin/sh
+BENCHMARK=is
+BENCHMARKU=IS
+
+include ../config/make.def
+
+include ../sys/make.common
+
+OBJS = is.o \
+       ${COMMON}/c_print_results.o \
+       ${COMMON}/c_timers.o \
+       ${COMMON}/c_wtime.o
+
+
+${PROGRAM}: config ${OBJS}
+	${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS}
+
+is.o:             is.cpp  npbparams.hpp
+	${CCOMPILE} is.cpp
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f npbparams.hpp core
+	- if [ -d rii_files ]; then rm -r rii_files; fi
diff --git a/NPB/IS/is.cpp b/NPB/IS/is.cpp
new file mode 100644
index 0000000..f182724
--- /dev/null
+++ b/NPB/IS/is.cpp
@@ -0,0 +1,996 @@
+/*--------------------------------------------------------------------
+
+    Information on NAS Parallel Benchmarks is available at:
+
+    http://www.nas.nasa.gov/Software/NPB/
+
+    Authors: M. Yarrow
+         H. Jin
+
+    CPP and OpenMP version:
+            Dalvan Griebler <dalvangriebler@gmail.com>
+            Júnior Löff <loffjh@gmail.com>
+
+--------------------------------------------------------------------*/
+
+#include <libdash.h>
+
+#include "npbparams.hpp"
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+
+
+/*****************************************************************/
+/* For serial IS, buckets are not really req'd to solve NPB1 IS  */
+/* spec, but their use on some machines improves performance, on */
+/* other machines the use of buckets compromises performance,    */
+/* probably because it is extra computation which is not req'd.  */
+/* (Note: Mechanism not understood, probably cache related)      */
+/* Example:  SP2-66MhzWN:  50% speedup with buckets              */
+/* Example:  SGI Indy5000: 50% slowdown with buckets             */
+/* Example:  SGI O2000:   400% slowdown with buckets (Wow!)      */
+/*****************************************************************/
+/* To disable the use of buckets, comment out the following line */
+//#define USE_BUCKETS
+
+/* Uncomment below for cyclic schedule */
+/*#define SCHED_CYCLIC*/
+
+
+/******************/
+/* default values */
+/******************/
+#ifndef CLASS
+#define CLASS 'S'
+#endif
+
+
+/*************/
+/*  CLASS S  */
+/*************/
+#if CLASS == 'S'
+#define  TOTAL_KEYS_LOG_2    16
+#define  MAX_KEY_LOG_2       11
+#define  NUM_BUCKETS_LOG_2   9
+#endif
+
+
+/*************/
+/*  CLASS W  */
+/*************/
+#if CLASS == 'W'
+#define  TOTAL_KEYS_LOG_2    20
+#define  MAX_KEY_LOG_2       16
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+/*************/
+/*  CLASS A  */
+/*************/
+#if CLASS == 'A'
+#define  TOTAL_KEYS_LOG_2    23
+#define  MAX_KEY_LOG_2       19
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+
+/*************/
+/*  CLASS B  */
+/*************/
+#if CLASS == 'B'
+#define  TOTAL_KEYS_LOG_2    25
+#define  MAX_KEY_LOG_2       21
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+
+/*************/
+/*  CLASS C  */
+/*************/
+#if CLASS == 'C'
+#define  TOTAL_KEYS_LOG_2    27
+#define  MAX_KEY_LOG_2       23
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+
+/*************/
+/*  CLASS D  */
+/*************/
+#if CLASS == 'D'
+#define  TOTAL_KEYS_LOG_2    31
+#define  MAX_KEY_LOG_2       27
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+
+#if CLASS == 'D'
+#define  TOTAL_KEYS          (1L << TOTAL_KEYS_LOG_2)
+#else
+#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
+#endif
+#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
+#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
+#define  NUM_KEYS            TOTAL_KEYS
+#define  SIZE_OF_BUFFERS     NUM_KEYS
+
+
+#define  MAX_ITERATIONS      10
+#define  TEST_ARRAY_SIZE     5
+
+
+/*************************************/
+/* Typedef: if necessary, change the */
+/* size of int here by changing the  */
+/* int type to, say, long            */
+/*************************************/
+#if CLASS == 'D'
+typedef  long INT_TYPE;
+#else
+typedef  int  INT_TYPE;
+#endif
+
+
+/********************/
+/* Some global info */
+/********************/
+int passed_verification;
+
+
+/************************************/
+/* These are the three main arrays. */
+/* See SIZE_OF_BUFFERS def above    */
+/************************************/
+dash::Array<INT_TYPE> key_array;
+dash::Array<INT_TYPE> key_buff2;
+
+INT_TYPE partial_verify_vals[TEST_ARRAY_SIZE];
+
+#ifdef USE_BUCKETS
+dash::Array<INT_TYPE> bucket_size;
+
+using pattern_t = dash::CSRPattern<1>;
+using array_t   = dash::Array<INT_TYPE, pattern_t::index_type, pattern_t>;
+
+array_t key_buff1;
+#else
+dash::Array<INT_TYPE> key_buff1;
+
+dash::Array<INT_TYPE> work_buffers;
+#endif
+
+
+/**********************/
+/* Partial verif info */
+/**********************/
+INT_TYPE test_index_array[TEST_ARRAY_SIZE],
+         test_rank_array[TEST_ARRAY_SIZE],
+
+         S_test_index_array[TEST_ARRAY_SIZE] =
+{48427,17148,23627,62548,4431},
+S_test_rank_array[TEST_ARRAY_SIZE] =
+{0,18,346,64917,65463},
+
+W_test_index_array[TEST_ARRAY_SIZE] =
+{357773,934767,875723,898999,404505},
+W_test_rank_array[TEST_ARRAY_SIZE] =
+{1249,11698,1039987,1043896,1048018},
+
+A_test_index_array[TEST_ARRAY_SIZE] =
+{2112377,662041,5336171,3642833,4250760},
+A_test_rank_array[TEST_ARRAY_SIZE] =
+{104,17523,123928,8288932,8388264},
+
+B_test_index_array[TEST_ARRAY_SIZE] =
+{41869,812306,5102857,18232239,26860214},
+B_test_rank_array[TEST_ARRAY_SIZE] =
+{33422937,10244,59149,33135281,99},
+
+C_test_index_array[TEST_ARRAY_SIZE] =
+{44172927,72999161,74326391,129606274,21736814},
+C_test_rank_array[TEST_ARRAY_SIZE] =
+{61147,882988,266290,133997595,133525895},
+
+D_test_index_array[TEST_ARRAY_SIZE] =
+{1317351170,995930646,1157283250,1503301535,1453734525},
+D_test_rank_array[TEST_ARRAY_SIZE] =
+{1,36538729,1978098519,2145192618,2147425337};
+
+
+/***********************/
+/* function prototypes */
+/***********************/
+double  randlc( double *X, double *A );
+
+void full_verify( void );
+
+/*void c_print_results( char   *name,
+                      char   class,
+                      int    n1,
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      double t,
+                      double mops,
+          char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *cc,
+                      char   *clink,
+                      char   *c_lib,
+                      char   *c_inc,
+                      char   *cflags,
+                      char   *clinkflags );*/
+void c_print_results( char   *name, char   class_npb, int    n1, int n2, int n3, int niter, int  nthreads, double t,
+                      double mops, char   *optype, int    passed_verification, char   *npbversion, char   *compiletime, char   *cc,
+                      char   *clink, char   *c_lib, char   *c_inc, char   *cflags, char   *clinkflags, char   *rand);
+
+void    timer_clear( int n );
+void    timer_start( int n );
+void    timer_stop( int n );
+double  timer_read( int n );
+
+
+/*
+ *    FUNCTION RANDLC (X, A)
+ *
+ *  This routine returns a uniform pseudorandom double precision number in the
+ *  range (0, 1) by using the linear congruential generator
+ *
+ *  x_{k+1} = a x_k  (mod 2^46)
+ *
+ *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+ *  before repeating.  The argument A is the same as 'a' in the above formula,
+ *  and X is the same as x_0.  A and X must be odd double precision integers
+ *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
+ *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+ *  the new seed x_1, so that subsequent calls to RANDLC using the same
+ *  arguments will generate a continuous sequence.
+ *
+ *  This routine should produce the same results on any computer with at least
+ *  48 mantissa bits in double precision floating point data.  On Cray systems,
+ *  double precision should be disabled.
+ *
+ *  David H. Bailey     October 26, 1990
+ *
+ *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ *     SAVE KS, R23, R46, T23, T46
+ *     DATA KS/0/
+ *
+ *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
+ *  by merely using the ** operator, in order to insure that the results are
+ *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
+ */
+
+/*****************************************************************/
+/*************           R  A  N  D  L  C             ************/
+/*************                                        ************/
+/*************    portable random number generator    ************/
+/*****************************************************************/
+
+static int      KS=0;
+static double  R23, R46, T23, T46;
+
+double  randlc( double *X, double *A ) {
+
+  double    T1, T2, T3, T4;
+  double    A1;
+  double    A2;
+  double    X1;
+  double    X2;
+  double    Z;
+  int         i, j;
+
+  if (KS == 0) {
+
+    R23 = 1.0;
+    R46 = 1.0;
+    T23 = 1.0;
+    T46 = 1.0;
+
+    for (i=1; i<=23; i++) {
+      R23 = 0.50 * R23;
+      T23 = 2.0 * T23;
+    }
+
+    for (i=1; i<=46; i++) {
+      R46 = 0.50 * R46;
+      T46 = 2.0 * T46;
+    }
+
+    KS = 1;
+  }
+
+  /*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
+
+  T1 = R23 * *A;
+  j  = T1;
+  A1 = j;
+  A2 = *A - T23 * A1;
+
+  /*  Break X into two parts such that X = 2^23 * X1 + X2, compute
+  Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+  X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
+
+  T1 = R23 * *X;
+  j  = T1;
+  X1 = j;
+  X2 = *X - T23 * X1;
+  T1 = A1 * X2 + A2 * X1;
+
+  j  = R23 * T1;
+  T2 = j;
+  Z = T1 - T23 * T2;
+  T3 = T23 * Z + A2 * X2;
+  j  = R46 * T3;
+  T4 = j;
+  *X = T3 - T46 * T4;
+  return(R46 * *X);
+}
+
+
+
+
+/*****************************************************************/
+/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
+/************                                         ************/
+/************ returns parallel random number seq seed ************/
+/*****************************************************************/
+
+/*
+ * Create a random number sequence of total length nn residing
+ * on np number of processors.  Each processor will therefore have a
+ * subsequence of length nn/np.  This routine returns that random
+ * number which is the first random number for the subsequence belonging
+ * to processor rank kn, and which is used as seed for proc kn ran # gen.
+ */
+
+double   find_my_seed( int kn,        /* my processor rank, 0<=kn<=num procs */
+                       int np,        /* np = num procs                      */
+                       long nn,       /* total num of ran numbers, all procs */
+                       double s,      /* Ran num seed, for ex.: 314159265.00 */
+                       double a )     /* Ran num gen mult, try 1220703125.00 */
+{
+
+  double t1,t2;
+  long   mq,nq,kk,ik;
+
+  if ( kn == 0 ) return s;
+
+  mq = (nn/4 + np - 1) / np;
+  nq = mq * 4 * kn;               /* number of rans to be skipped */
+
+  t1 = s;
+  t2 = a;
+  kk = nq;
+  while ( kk > 1 ) {
+    ik = kk / 2;
+    if ( 2 * ik ==  kk ) {
+      (void)randlc( &t2, &t2 );
+      kk = ik;
+    }
+    else {
+      (void)randlc( &t1, &t2 );
+      kk = kk - 1;
+    }
+  }
+  (void)randlc( &t1, &t2 );
+
+  return( t1 );
+}
+
+
+
+/*****************************************************************/
+/*************      C  R  E  A  T  E  _  S  E  Q      ************/
+/*****************************************************************/
+
+void  create_seq( double seed, double a ) {
+
+  double x, s;
+  INT_TYPE i, k;
+
+  INT_TYPE k1, k2;
+  double an = a;
+  int myid, num_procs;
+  INT_TYPE mq;
+
+  myid = dash::myid();
+  num_procs = dash::size();
+
+  mq = (NUM_KEYS + num_procs - 1) / num_procs;
+  k1 = mq * myid;
+  k2 = k1 + mq;
+  if ( k2 > NUM_KEYS ) k2 = NUM_KEYS;
+
+  KS = 0;
+  s = find_my_seed( myid, num_procs, (long)4*NUM_KEYS, seed, an );
+
+  k = MAX_KEY/4;
+
+  for (i=k1; i<k2; i++) {
+    x = randlc(&s, &an);
+    x += randlc(&s, &an);
+    x += randlc(&s, &an);
+    x += randlc(&s, &an);
+
+    key_array[i] = k*x;
+  }
+}
+
+
+
+/*****************************************************************/
+/*****************    Allocate Working Buffer     ****************/
+/*****************************************************************/
+void *alloc_mem( size_t size ) {
+
+  void *p;
+
+  p = (void *)malloc(size);
+  if ( !p ) {
+    perror("Memory allocation error");
+    exit(1);
+  }
+  return p;
+}
+
+void alloc_key_buff( void ) {
+
+  INT_TYPE i;
+  int      num_procs;
+
+
+  num_procs = dash::size();
+
+  key_array.allocate(NUM_KEYS, dash::BLOCKED);
+  key_buff2.allocate(NUM_KEYS, dash::BLOCKED);
+
+#ifdef USE_BUCKETS
+
+    bucket_size.allocate(NUM_BUCKETS*num_procs, dash::BLOCKED);
+
+    int shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
+    INT_TYPE num_bucket_keys = (1L << shift);
+
+    std::vector<pattern_t::size_type> l_sizes;
+    dash::Array<int> dummy(NUM_BUCKETS);
+
+    dash::Array<pattern_t::size_type> my_elem_count(dash::size());
+
+    my_elem_count.local[0] = dummy.lsize() * num_bucket_keys;
+
+    dash::barrier();
+
+    for (int unit_idx = 0; unit_idx < dash::size(); ++unit_idx) {
+      l_sizes.push_back(my_elem_count[unit_idx]);
+    }
+
+    pattern_t pattern(l_sizes);
+
+    key_buff1.allocate(pattern);
+
+#else
+    key_buff1.allocate(MAX_KEY, dash::BLOCKED);
+
+    work_buffers.allocate(num_procs*MAX_KEY);
+#endif /*USE_BUCKETS*/
+}
+
+
+
+/*****************************************************************/
+/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
+/*****************************************************************/
+
+
+void full_verify( void ) {
+
+  /*  Now, finally, sort the keys:  */
+
+  /*  Copy keys into work array; keys in key_array will be reassigned. */
+
+  std::copy(key_array.lbegin(), key_array.lend(), key_buff2.lbegin());
+
+  key_array.barrier();
+
+  /* This is actual sorting. Each thread is responsible for
+  a subset of key values */
+  INT_TYPE j = dash::size();
+  j = (MAX_KEY + j - 1) / j;
+  INT_TYPE k1 = j * dash::myid();
+  INT_TYPE k2 = k1 + j;
+  if ( k2 > MAX_KEY ) k2 = MAX_KEY;
+
+  std::for_each(key_buff2.begin(), key_buff2.end(), [k1, k2](INT_TYPE i) {
+    if (i >= k1 && i < k2) {
+      key_array[--key_buff1[i]] = i;
+    }
+  });
+
+  dash::barrier();
+
+  /*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
+
+  dash::Array<INT_TYPE> faults(dash::size());
+  dash::fill(faults.begin(), faults.end(), 0);
+
+  dash::Array<INT_TYPE> v2(NUM_KEYS-1);
+
+  //std::iota(v2.lbegin(), v2.lend(), dash::myid() * ceil((double) NUM_KEYS / dash::size())); //only works when using blocking pattern
+  if ( 0 == dash::myid() ) std::iota(v2.begin(), v2.end(), 1);
+
+  v2.barrier();
+
+  dash::for_each(v2.begin(), v2.end(), [&faults, &v2](INT_TYPE i) {
+    if ( key_array[i-1] > key_array[i] ){
+      printf("Fault at %d of %d\n", i, (int) v2[NUM_KEYS-1]);
+      faults.local[0]++;
+    }
+  });
+
+  dash::barrier();
+
+
+  if ( 0 == dash::myid() ) {
+    for (int i = 1; i < dash::size(); i++) faults[0] += faults[i];
+
+    if ( faults[0] != 0 )
+      printf( "Full_verify: number of keys out of sort: %ld\n", (long) faults[0] );
+      else
+      passed_verification++;
+  }
+}
+
+
+
+
+/*****************************************************************/
+/*************             R  A  N  K             ****************/
+/*****************************************************************/
+
+
+void rank( int iteration ) {
+
+  INT_TYPE    i, k;
+
+#ifdef USE_BUCKETS
+    int shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
+    INT_TYPE num_bucket_keys = (1L << shift);
+#endif
+
+  key_array[iteration] = iteration;
+  key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
+
+
+  /*  Determine where the partial verify test keys are, load into  */
+  /*  top of array bucket_size                                     */
+  for (i=0; i<TEST_ARRAY_SIZE; i++)
+    partial_verify_vals[i] = key_array[test_index_array[i]];
+
+
+  int myid = dash::myid();
+  int num_procs = dash::size();
+
+  /*  Bucket sort is known to improve cache performance on some   */
+  /*  cache based systems.  But the actual performance may depend */
+  /*  on cache size, problem size. */
+#ifdef USE_BUCKETS
+
+    //INT_TYPE bucket_ptrs_local[NUM_BUCKETS];
+    dash::Array<INT_TYPE> bucket_ptrs(NUM_BUCKETS*num_procs);
+
+    /*  Initialize */
+    for (i=0; i<NUM_BUCKETS; i++)
+      bucket_size.local[i] = 0;
+
+    /*  Determine the number of keys in each bucket */
+    dash::for_each(key_array.begin(), key_array.end(), [shift](int k) {
+      bucket_size.local[k >> shift]++;
+    });
+
+    /*  Accumulative bucket sizes are the bucket pointers.
+    These are global sizes accumulated upon to each bucket */
+    bucket_ptrs.local[0] = 0;
+    for (k=0; k< myid; k++)  {
+      bucket_ptrs.local[0] += bucket_size[k*NUM_BUCKETS];
+    }
+
+    for (i=1; i< NUM_BUCKETS; i++) {
+      bucket_ptrs.local[i] = bucket_ptrs.local[i-1];
+      for (k=0; k< myid; k++)
+        bucket_ptrs.local[i] += bucket_size[k*NUM_BUCKETS+i];
+        for (k=myid; k< num_procs; k++)
+          bucket_ptrs.local[i] += bucket_size[k*NUM_BUCKETS+i-1];
+    }
+
+    ////////////////////////////////////
+    dash::barrier();
+    typedef dash::CSRPattern<1>           pattern_t;
+    typedef int                           index_t;
+    typedef typename pattern_t::size_type extent_t;
+
+    std::vector<extent_t> local_sizes;
+    dash::Array<int> my_elem_count(dash::size());
+    int local_buckets = ceil((double) NUM_BUCKETS / dash::size());
+
+    my_elem_count.local[0] = ((myid == num_procs - 1)? NUM_KEYS : bucket_ptrs[(myid + 1)*local_buckets]) - bucket_ptrs[(myid + 0)*local_buckets];
+
+    dash::barrier();
+
+    for (int unit_idx = 0; unit_idx < dash::size(); ++unit_idx) {
+      local_sizes.push_back(my_elem_count[unit_idx]);
+    }
+
+    pattern_t pattern(local_sizes);
+
+    dash::Array<INT_TYPE, index_t, pattern_t> key_buff2l(pattern);
+
+    ////////////////////////////////7
+    dash::for_each(key_array.begin(), key_array.end(), [&shift, &bucket_ptrs, &key_buff2l](int k) {
+      key_buff2l[bucket_ptrs.local[k >> shift]++] = k;
+    });
+
+    if ( myid < num_procs-1 ) {
+      for (i=0; i< NUM_BUCKETS; i++)
+        for (k=myid+1; k< num_procs; k++)
+          bucket_ptrs.local[i] += bucket_size[k*NUM_BUCKETS+i];
+    }
+
+
+    /*  Now, buckets are sorted.  We only need to sort keys inside
+    each bucket, which can be done in parallel.  Because the distribution
+    of the number of keys in the buckets is Gaussian, the use of
+    a dynamic schedule should improve load balance, thus, performance     */
+
+    dash::Array<int> v(NUM_BUCKETS);
+
+    //if ( 0 == dash::myid()) std::iota(v.begin(), v.end(), 0);
+
+    std::iota(v.lbegin(), v.lend(), dash::myid() * local_buckets); //only works when using blocking pattern
+
+    v.barrier();
+
+    dash::for_each(v.begin(), v.end(), [&num_bucket_keys, &bucket_ptrs, &key_buff2l, &local_buckets, &myid](int i)  {
+
+      int key_buff1_offset = local_buckets * myid * num_bucket_keys;
+      /*  Clear the work array section associated with each bucket */
+      INT_TYPE j;
+      INT_TYPE k1 = i * num_bucket_keys - key_buff1_offset;
+      INT_TYPE k2 = k1 + num_bucket_keys;
+      for (j = k1; j < k2; j++)
+        key_buff1.local[j] = 0;
+
+      /*  Ranking of all keys occurs in this section:                 */
+      int key_buff2_offset = (myid > 0)? bucket_ptrs.local[myid*local_buckets-1] : 0;
+
+      /*  In this section, the keys themselves are used as their
+      own indexes to determine how many of each there are: their
+      individual population                                       */
+      INT_TYPE m = (i > 0)? bucket_ptrs.local[i-1] : 0;
+      for (j = m - key_buff2_offset; j < bucket_ptrs.local[i] - key_buff2_offset; j++)
+        key_buff1.local[key_buff2l.local[j] - key_buff1_offset]++;  /* Now they have individual key population */
+
+        /*  To obtain ranks of each key, successively add the individual key
+        population, not forgetting to add m, the total of lesser keys,
+        to the first key population                                          */
+        key_buff1.local[k1] += m;
+        for (j = k1+1; j < k2; j++)
+          key_buff1.local[j] += key_buff1.local[j-1];
+      });
+
+#else /*USE_BUCKETS*/
+
+    std::fill(work_buffers.lbegin(), work_buffers.lend(), 0);
+
+    // compute the histogram for the local keys
+    for (int i=0; i<key_array.lsize(); i++) {
+        work_buffers.local[ key_array.local[i] ]++;
+      }
+
+    // turn it into a cumulative histogram
+    for (int i=0; i<MAX_KEY-1; i++) {
+      work_buffers.local[i+1] += work_buffers.local[i];
+    }
+
+    // compute the offset of this unit's local part in
+    // the global key_buff1 array
+    auto& pat = key_buff1.pattern();
+    int goffs = pat.global(0);
+
+    for (int i=0; i<key_buff1.lsize(); i++) {
+      key_buff1.local[i] = work_buffers.local[goffs+i];
+    }
+
+    dash::barrier();
+
+    for (int unit=1; unit<num_procs; unit++) {
+      for (int i=0; i<key_buff1.lsize(); i++) {
+        key_buff1.local[i] += work_buffers[((myid+unit)%num_procs)*MAX_KEY+goffs+i];
+      }
+    }
+
+    dash::barrier();
+
+#endif /*USE_BUCKETS*/
+
+
+  /* This is the partial verify test section */
+  /* Observe that test_rank_array vals are   */
+  /* shifted differently for different cases */
+
+  if ( 0 == myid ) {
+
+    for ( i=0; i<TEST_ARRAY_SIZE; i++ ) {
+
+      k = partial_verify_vals[i];          // test vals were put here
+      if ( 0 < k  &&  k <= NUM_KEYS-1 ) {
+
+        INT_TYPE key_rank = key_buff1[k-1];
+        int failed = 0;
+
+        switch( CLASS ) {
+          case 'S':
+            if ( i <= 2 ) {
+              if ( key_rank != test_rank_array[i]+iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            } else {
+              if ( key_rank != test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            }
+            break;
+          case 'W':
+            if ( i < 2 ) {
+              if ( key_rank != test_rank_array[i]+(iteration-2) )
+                failed = 1;
+              else
+                passed_verification++;
+            } else {
+              if ( key_rank != test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            }
+            break;
+          case 'A':
+            if ( i <= 2 ) {
+              if ( key_rank != test_rank_array[i]+(iteration-1) )
+                failed = 1;
+              else
+                passed_verification++;
+            } else {
+              if ( key_rank != test_rank_array[i]-(iteration-1) )
+                failed = 1;
+              else
+                passed_verification++;
+            }
+            break;
+          case 'B':
+            if ( i == 1 || i == 2 || i == 4 ) {
+              if ( key_rank != test_rank_array[i]+iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            } else {
+              if ( key_rank != test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            }
+            break;
+          case 'C':
+            if ( i <= 2 ) {
+              if ( key_rank != test_rank_array[i]+iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            } else {
+              if ( key_rank != test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            }
+            break;
+          case 'D':
+            if ( i < 2 ) {
+              if ( key_rank != test_rank_array[i]+iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            } else {
+              if ( key_rank != test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                passed_verification++;
+            }
+            break;
+        }
+        if ( failed == 1 )
+          printf( "Failed partial verification: "
+                  "iteration %d, test key %d\n",
+                  iteration, (int)i );
+      }
+    }
+  }
+
+}
+
+
+/*****************************************************************/
+/*************             M  A  I  N             ****************/
+/*****************************************************************/
+
+int main( int argc, char **argv ) {
+
+  dash::init(&argc, &argv);
+
+  int nthreads=dash::size();
+  int   i, iteration, timer_on;
+  double  timecounter;
+
+  if ( 0 == dash::myid() ) {
+
+    FILE *fp;
+
+
+    /*  Initialize timers  */
+    timer_on = 0;
+    if ( (fp = fopen("timer.flag", "r")) != NULL ) {
+      fclose(fp);
+      timer_on = 1;
+    }
+    timer_clear( 0 );
+    if ( timer_on ) {
+      timer_clear( 1 );
+      timer_clear( 2 );
+      timer_clear( 3 );
+    }
+
+    if ( timer_on ) timer_start( 3 );
+
+
+    /*  Initialize the verification arrays if a valid class */
+    for (i=0; i<TEST_ARRAY_SIZE; i++)
+      switch( CLASS ) {
+        case 'S':
+          test_index_array[i] = S_test_index_array[i];
+          test_rank_array[i]  = S_test_rank_array[i];
+          break;
+        case 'A':
+          test_index_array[i] = A_test_index_array[i];
+          test_rank_array[i]  = A_test_rank_array[i];
+          break;
+        case 'W':
+          test_index_array[i] = W_test_index_array[i];
+          test_rank_array[i]  = W_test_rank_array[i];
+          break;
+        case 'B':
+          test_index_array[i] = B_test_index_array[i];
+          test_rank_array[i]  = B_test_rank_array[i];
+          break;
+        case 'C':
+          test_index_array[i] = C_test_index_array[i];
+          test_rank_array[i]  = C_test_rank_array[i];
+          break;
+        case 'D':
+          test_index_array[i] = D_test_index_array[i];
+          test_rank_array[i]  = D_test_rank_array[i];
+          break;
+      };
+
+
+
+      /*  Printout initial NPB info */
+      printf( "\n\n NAS Parallel Benchmarks 4.0 OpenMP C++ version - IS Benchmark\n\n" );
+      printf( "\n\n Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br>\n" );
+      printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
+      printf( " Iterations:  %d\n", MAX_ITERATIONS );
+      printf( " Number of available threads:  %d\n", std::thread::hardware_concurrency() );
+      printf( "\n" );
+
+      #ifdef USE_BUCKETS
+        printf(" This Version of the Benchmark DOES USE Buckets\n\n");
+      #else
+        printf(" This Version of the Benchmark DOES NOT USE Buckets\n\n");
+      #endif
+
+      if ( timer_on ) timer_start( 1 );
+    }
+
+  alloc_key_buff();
+
+  /*  Generate random number sequence and subsequent keys on all procs */
+  create_seq( 314159265.00,                    /* Random number gen seed */
+              1220703125.00 );                 /* Random number gen mult */
+
+  if ( 0 == dash::myid() ) {
+    if ( timer_on ) timer_stop( 1 );
+  }
+
+
+  /*  Do one interation for free (i.e., untimed) to guarantee initialization of
+  all data and code pages and respective tables */
+  rank( 1 );
+
+  /*  Start verification counter */
+  passed_verification = 0;
+
+  if ( 0 == dash::myid() ) if ( CLASS != 'S' ) printf( "\n   iteration\n" );
+
+  /*  Start timer  */
+  if ( 0 == dash::myid() ) timer_start( 0 );
+
+  /*  This is the main iteration */
+  for (iteration=1; iteration<=MAX_ITERATIONS; iteration++) {
+
+    if ( 0 == dash::myid() ) if ( CLASS != 'S' ) printf( "        %d\n", iteration );
+    rank( iteration );
+    dash::barrier();
+  }
+
+  if ( 0 == dash::myid() ) {
+
+    /*  End of timing, obtain maximum time of all processors */
+    timer_stop( 0 );
+
+    timecounter = timer_read( 0 );
+
+
+    /*  This tests that keys are in sequence: sorting of last ranked key seq
+    occurs here, but is an untimed operation                             */
+    if ( timer_on ) timer_start( 2 );
+  }
+
+  full_verify();
+
+  if ( 0 == dash::myid() ) {
+    if ( timer_on ) timer_stop( 2 );
+
+    if ( timer_on ) timer_stop( 3 );
+
+    /*  The final printout  */
+    if ( passed_verification != 5*MAX_ITERATIONS + 1 )
+      passed_verification = 0;
+
+    /*c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS))
+    /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC,
+    CFLAGS, CLINKFLAGS );*/
+    c_print_results( (char*)"IS", CLASS, TOTAL_KEYS, 0, 0, MAX_ITERATIONS, nthreads, timecounter,
+                     ((double) (MAX_ITERATIONS*TOTAL_KEYS))/timecounter/1000000.0, (char*)"keys ranked", passed_verification,
+                     (char*)NPBVERSION, (char*)COMPILETIME, (char*)CC, (char*)CLINK, (char*)C_LIB, (char*)C_INC, (char*)CFLAGS, (char*)CLINKFLAGS, (char*)"randlc");
+
+    /*  Print additional timers  */
+    if ( timer_on ) {
+      double t_total, t_percent;
+
+      t_total = timer_read( 3 );
+      printf("\nAdditional timers -\n");
+      printf(" Total execution: %8.3f\n", t_total);
+      if ( t_total == 0.0 ) t_total = 1.0;
+      timecounter = timer_read(1);
+      t_percent = timecounter/t_total * 100.;
+      printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
+      timecounter = timer_read(0);
+      t_percent = timecounter/t_total * 100.;
+      printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
+      timecounter = timer_read(2);
+      t_percent = timecounter/t_total * 100.;
+      printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
+    }
+
+  }
+
+  dash::finalize();
+
+  return 0;
+}
+/**************************/
+/*  E N D  P R O G R A M  */
+/**************************/
diff --git a/NPB/MG/Makefile b/NPB/MG/Makefile
new file mode 100644
index 0000000..2bcf6b1
--- /dev/null
+++ b/NPB/MG/Makefile
@@ -0,0 +1,20 @@
+SHELL=/bin/sh
+BENCHMARK=mg
+BENCHMARKU=MG
+
+include ../config/make.def
+
+OBJS = mg.o ${COMMON}/c_print_results.o  \
+       ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+	${CLINK} -o ${PROGRAM} ${OBJS} ${C_LIB} ${CLINKFLAGS}
+
+mg.o:		mg.cpp npbparams.hpp
+	${CCOMPILE} mg.cpp
+
+clean:
+	- rm -f *.o *~
+	- rm -f npbparams.hpp core
diff --git a/NPB/MG/globals.hpp b/NPB/MG/globals.hpp
new file mode 100644
index 0000000..f1ea027
--- /dev/null
+++ b/NPB/MG/globals.hpp
@@ -0,0 +1,45 @@
+/*--------------------------------------------------------------------
+c  Parameter lm (declared and set in "npbparams.h") is the log-base2 of 
+c  the edge size max for the partition on a given node, so must be changed 
+c  either to save space (if running a small case) or made bigger for larger 
+c  cases, for example, 512^3. Thus lm=7 means that the largest dimension 
+c  of a partition that can be solved on a node is 2^7 = 128. lm is set 
+c  automatically in npbparams.h
+c  Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 
+c-------------------------------------------------------------------*/
+
+#include "npbparams.hpp"
+
+/* parameters */
+/* actual dimension including ghost cells for communications */
+#define	NM	(2+(2<<(LM-1)))
+/* size of rhs array */
+#define	NV	(2+(2<<(NDIM1-1))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1))))
+/* size of residual array */
+#define	NR	((8*(NV+(NM*NM)+5*NM+7*LM))/7)
+/* size of communication buffer */
+#define	NM2	(2*NM*NM)
+/* maximum number of levels */
+#define	MAXLEVEL	11
+
+#define TIMERS_ENABLED	FALSE
+
+/*---------------------------------------------------------------------*/
+/* common /mg3/ */
+static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1];
+/* common /ClassType/ */
+static char class_npb;
+/* common /my_debug/ */
+static int debug_vec[8];
+/* common /fap/ */
+/*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/
+static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1];
+static int lt, lb;
+
+/*c---------------------------------------------------------------------
+c  Set at m=1024, can handle cases up to 1024^3 case
+c---------------------------------------------------------------------*/
+#define	M	1037
+
+/* common /buffer/ */
+/*static double buff[4][NM2];*/
diff --git a/NPB/MG/mg.cpp b/NPB/MG/mg.cpp
new file mode 100644
index 0000000..61f7569
--- /dev/null
+++ b/NPB/MG/mg.cpp
@@ -0,0 +1,1600 @@
+/*--------------------------------------------------------------------
+  Information on NAS Parallel Benchmarks is available at:
+  http://www.nas.nasa.gov/Software/NPB/
+  Authors: E. Barszcz
+    P. Frederickson
+    A. Woo
+    M. Yarrow
+  DASH version:
+  Nicco Mietzsch <nicco.mietzsch@campus.lmu.de>
+  CPP and OpenMP version:
+      Dalvan Griebler <dalvangriebler@gmail.com>
+      Júnior Löff <loffjh@gmail.com>
+--------------------------------------------------------------------*/
+#include <libdash.h>
+
+#include <vector>
+#include <iostream>
+#include "npb-CPP.hpp"
+
+#include "globals.hpp"
+
+/* parameters */
+#define T_BENCH 1
+#define T_INIT 2
+#define T_STL 3
+
+/* global variables */
+/* common /grid/ */
+static int is1, is2, is3, ie1, ie2, ie3;
+
+/* functions prototypes */
+static void setup(int *n1, int *n2, int *n3, int lt);
+static void mg3P(std::vector<dash::NArray<double, 3> > &u, dash::NArray<double, 3> &v, std::vector<dash::NArray<double, 3> > &r, double a[4], double c[4], int n1, int n2, int n3, int k);
+static void psinv(dash::NArray<double, 3> &r, dash::NArray<double, 3> &u, int n1, int n2, int n3, double c[4], int k);
+static void resid(dash::NArray<double, 3> &u, dash::NArray<double, 3> &v, dash::NArray<double, 3> &r, int n1, int n2, int n3, double a[4], int k);
+static void rprj3(dash::NArray<double, 3> &r, int m1k, int m2k, int m3k, dash::NArray<double, 3> &s, int m1j, int m2j, int m3j, int k);
+static void interp(dash::NArray<double, 3> &z, int mm1, int mm2, int mm3, dash::NArray<double, 3> &u, int n1, int n2, int n3, int k);
+static void norm2u3(dash::NArray<double, 3> &r, int n1, int n2, int n3, double *rnm2, double *rnmu, int nx, int ny, int nz);
+static void rep_nrm(dash::NArray<double, 3> &u, int n1, int n2, int n3, char *title, int kk);
+static void comm3(dash::NArray<double, 3> &u, int n1, int n2, int n3);
+static void zran3(dash::NArray<double, 3> &z, int n1, int n2, int n3, int nx, int ny, int k);
+static void showall(dash::NArray<double, 3> &z, int n1, int n2, int n3);
+static double power(double a, int n);
+static void bubble(double ten[M][2], int j1[M][2], int j2[M][2], int j3[M][2], int m, int ind);
+static void zero3(dash::NArray<double, 3> &z, int n1, int n2, int n3);
+/*static void nonzero(double ***z, int n1, int n2, int n3);*/
+
+/*--------------------------------------------------------------------
+  program mg
+c-------------------------------------------------------------------*/
+
+int main(int argc, char *argv[]) {
+
+  dash::init(&argc, &argv);
+  /*-------------------------------------------------------------------------
+  c k is the current level. It is passed down through subroutine args
+  c and is NOT global. it is the current iteration
+  c------------------------------------------------------------------------*/
+
+  int k, it;
+  double t, tinit, mflops;
+  int nthreads = 1;
+
+  /*-------------------------------------------------------------------------
+  c These arrays are in common because they are quite large
+  c and probably shouldn't be allocated on the stack. They
+  c are always passed as subroutine args.
+  c------------------------------------------------------------------------*/
+
+  auto distspec = dash::DistributionSpec<3>(dash::BLOCKED, dash::NONE , dash::NONE);
+  std::vector<dash::NArray<double, 3> > u;
+  dash::NArray<double, 3> v;
+  std::vector<dash::NArray<double, 3> > r;
+
+  double a[4], c[4];
+
+  double rnm2, rnmu;
+  double epsilon = 1.0e-8;
+  int n1, n2, n3, nit;
+  double verify_value;
+  boolean verified;
+
+  int i, j, l;
+  FILE *fp;
+
+  timer_clear(T_BENCH);
+  timer_clear(T_INIT);
+  timer_clear(T_STL);
+
+  nthreads = dash::size();
+
+  timer_start(T_INIT);
+
+  /*----------------------------------------------------------------------
+  c Read in and broadcast input data
+  c---------------------------------------------------------------------*/
+  if ( dash::myid() == 0 ) {
+    printf("\n\n NAS Parallel Benchmarks 4.0 C++ DASH version" " - MG Benchmark\n\n");
+    printf("\n\n Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br>\n");
+    printf("\n\n DASH version by: Nicco Mietzsch <nicco.mietzsch@campus.lmu.de>\n");
+  }
+
+  fp = fopen("mg.input", "r");
+  if ( fp != NULL ) {
+    printf(" Reading from input file mg.input\n");
+    if ( fscanf(fp, "%d", &lt) != 1 ){
+      printf(" Error in reading elements\n");
+      exit(1);
+    }
+    while (fgetc(fp) != '\n');
+    if ( fscanf(fp, "%d%d%d", &nx[lt], &ny[lt], &nz[lt]) != 3 ){
+      printf(" Error in reading elements\n");
+      exit(1);
+    }
+    while (fgetc(fp) != '\n');
+    if ( fscanf(fp, "%d", &nit) != 1 ){
+      printf(" Error in reading elements\n");
+      exit(1);
+    }
+    while (fgetc(fp) != '\n');
+    for (i = 0; i <= 7; i++) {
+      if ( fscanf(fp, "%d", &debug_vec[i]) != 1 ){
+        printf(" Error in reading elements\n");
+        exit(1);
+      }
+    }
+    fclose(fp);
+  } else {
+    if ( dash::myid() == 0 ) printf(" No input file. Using compiled defaults\n");
+
+    lt = LT_DEFAULT;
+    nit = NIT_DEFAULT;
+    nx[lt] = NX_DEFAULT;
+    ny[lt] = NY_DEFAULT;
+    nz[lt] = NZ_DEFAULT;
+
+    for (i = 0; i <= 7; i++) {
+      debug_vec[i] = DEBUG_DEFAULT;
+    }
+  }
+
+  if ( (nx[lt] != ny[lt]) || (nx[lt] != nz[lt]) ) {
+    class_npb = 'U';
+  } else if ( nx[lt] == 32 && nit == 4 ) {
+    class_npb = 'S';
+  } else if ( nx[lt] == 64 && nit == 40 ) {
+    class_npb = 'W';
+  } else if ( nx[lt] == 256 && nit == 20 ) {
+    class_npb = 'B';
+  } else if ( nx[lt] == 512 && nit == 20 ) {
+    class_npb = 'C';
+  } else if ( nx[lt] == 256 && nit == 4 ) {
+    class_npb = 'A';
+  } else {
+    class_npb = 'U';
+  }
+
+  /*--------------------------------------------------------------------
+  c  Use these for debug info:
+  c---------------------------------------------------------------------
+  c   debug_vec(0) = 1 !=> report all norms
+  c   debug_vec(1) = 1 !=> some setup information
+  c   debug_vec(1) = 2 !=> more setup information
+  c   debug_vec(2) = k => at level k or below, show result of resid
+  c   debug_vec(3) = k => at level k or below, show result of psinv
+  c   debug_vec(4) = k => at level k or below, show result of rprj
+  c   debug_vec(5) = k => at level k or below, show result of interp
+  c   debug_vec(6) = 1 => (unused)
+  c   debug_vec(7) = 1 => (unused)
+  c-------------------------------------------------------------------*/
+
+  a[0] = -8.0/3.0;
+  a[1] =  0.0;
+  a[2] =  1.0/6.0;
+  a[3] =  1.0/12.0;
+
+  if ( class_npb == 'A' || class_npb == 'S' || class_npb =='W' ) {
+    /*--------------------------------------------------------------------
+    c  Coefficients for the S(a) smoother
+    c-------------------------------------------------------------------*/
+    c[0] =  -3.0/8.0;
+    c[1] =  1.0/32.0;
+    c[2] =  -1.0/64.0;
+    c[3] =   0.0;
+  } else {
+    /*--------------------------------------------------------------------
+    c   Coefficients for the S(b) smoother
+    c-------------------------------------------------------------------*/
+    c[0] =  -3.0/17.0;
+    c[1] =  1.0/33.0;
+    c[2] =  -1.0/61.0;
+    c[3] =   0.0;
+  }
+
+  lb = 1;
+
+  setup(&n1,&n2,&n3,lt);
+
+  u.resize(lt+1);
+  for (l = lt; l >=1; l--) {
+    u[l].allocate(m3[l],m2[l],m1[l], distspec);
+  }
+
+  v.allocate(m3[lt],m2[lt],m1[lt], distspec);
+
+  r.resize(lt+1);
+  for (l = lt; l >=1; l--) {
+    r[l].allocate(m3[l],m2[l],m1[l], distspec);
+  }
+
+  zero3(u[lt],n1,n2,n3);
+  zran3(v,n1,n2,n3,nx[lt],ny[lt],lt);
+
+  norm2u3(v,n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);
+
+  if ( dash::myid() == 0 ) {
+    /*printf("\n norms of random v are\n");
+    printf(" %4d%19.12e%19.12e\n", 0, rnm2, rnmu);
+    printf(" about to evaluate resid, k= %d\n", lt);*/
+
+    printf(" Size: %3dx%3dx%3d (class_npb %1c)\n", nx[lt], ny[lt], nz[lt], class_npb);
+    printf(" Iterations: %3d\n", nit);
+  }
+
+  resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
+  norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);
+
+  /*c---------------------------------------------------------------------
+  c   One iteration for startup
+  c---------------------------------------------------------------------*/
+  mg3P(u,v,r,a,c,n1,n2,n3,lt);
+  resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
+
+  setup(&n1,&n2,&n3,lt);
+
+  zero3(u[lt],n1,n2,n3);
+  zran3(v,n1,n2,n3,nx[lt],ny[lt],lt);
+
+  if ( dash::myid() == 0 ) {
+    timer_stop(T_INIT);
+
+    timer_clear(T_STL);
+
+    //std::clear();
+
+    timer_start(T_BENCH);
+  }
+
+  resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
+  norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);
+
+  for (it = 1; it <= nit; it++) {
+    mg3P(u,v,r,a,c,n1,n2,n3,lt);
+    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
+  }
+  norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);
+
+  if ( dash::myid() == 0 ) {
+    timer_stop(T_BENCH);
+
+    t = timer_read(T_BENCH);
+    tinit = timer_read(T_INIT);
+
+    verified = FALSE;
+    verify_value = 0.0;
+
+    printf(" Initialization time: %15.3f seconds\n", tinit);
+    printf(" Benchmark completed\n");
+
+    if ( class_npb != 'U' ) {
+      if ( class_npb == 'S' ) {
+          verify_value = 0.530770700573e-04;
+      } else if ( class_npb == 'W' ) {
+          verify_value = 0.250391406439e-17;  /* 40 iterations*/
+        /*  0.183103168997d-044 iterations*/
+      } else if ( class_npb == 'A' ) {
+          verify_value = 0.2433365309e-5;
+        } else if ( class_npb == 'B' ) {
+          verify_value = 0.180056440132e-5;
+        } else if ( class_npb == 'C' ) {
+          verify_value = 0.570674826298e-06;
+      }
+
+      if ( fabs( rnm2 - verify_value ) <= epsilon ) {
+        verified = TRUE;
+        printf(" VERIFICATION SUCCESSFUL\n");
+        printf(" L2 Norm is %20.12e\n", rnm2);
+        printf(" Error is   %20.12e\n", rnm2 - verify_value);
+      } else {
+        verified = FALSE;
+        printf(" VERIFICATION FAILED\n");
+        printf(" L2 Norm is       %20.12e\n", rnm2);
+        printf(" The correct L2 Norm is %20.12e\n", verify_value);
+      }
+    } else {
+      verified = FALSE;
+      printf(" Problem size unknown\n");
+      printf(" NO VERIFICATION PERFORMED\n");
+    }
+
+    if ( t != 0.0 ) {
+      int nn = nx[lt]*ny[lt]*nz[lt];
+      mflops = 58.*nit*nn*1.0e-6 / t;
+    } else {
+      mflops = 0.0;
+    }
+
+    c_print_results((char*)"MG", class_npb, nx[lt], ny[lt], nz[lt], nit, nthreads, t, mflops, (char*)"      floating point",
+        verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);
+    if ( TIMERS_ENABLED == TRUE ) printf(" time spent in STL: %15.3f seconds\n", timer_read(T_STL));
+
+  }
+
+  dash::finalize();
+
+  return 0;
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void setup(int *n1, int *n2, int *n3, int lt) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  int k;
+
+  for (k = lt-1; k >= 1; k--) {
+    nx[k] = nx[k+1]/2;
+    ny[k] = ny[k+1]/2;
+    nz[k] = nz[k+1]/2;
+  }
+
+  for (k = 1; k <= lt; k++) {
+    m1[k] = nx[k]+2;
+    m2[k] = nz[k]+2;
+    m3[k] = ny[k]+2;
+  }
+
+  is1 = 1;
+  ie1 = nx[lt];
+  *n1 = nx[lt]+2;
+  is2 = 1;
+  ie2 = ny[lt];
+  *n2 = ny[lt]+2;
+  is3 = 1;
+  ie3 = nz[lt];
+  *n3 = nz[lt]+2;
+
+  if ( debug_vec[1] >=  1 && dash::myid() == 0 ) {
+    printf(" in setup, \n");
+    printf("  lt  nx  ny  nz  n1  n2  n3 is1 is2 is3 ie1 ie2 ie3\n");
+    printf("%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d\n",lt,nx[lt],ny[lt],nz[lt],*n1,*n2,*n3,is1,is2,is3,ie1,ie2,ie3);
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void mg3P(std::vector<dash::NArray<double, 3> > &u, dash::NArray<double, 3> &v, std::vector<dash::NArray<double, 3> > &r, double a[4], double c[4], int n1, int n2, int n3, int k) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c multigrid V-cycle routine
+  c-------------------------------------------------------------------*/
+
+  int j;
+
+  /*--------------------------------------------------------------------
+  c   down cycle.
+  c   restrict the residual from the find grid to the coarse
+  c-------------------------------------------------------------------*/
+
+  for (k = lt; k >= lb+1; k--) {
+    j = k-1;
+    rprj3(r[k], m1[k], m2[k], m3[k], r[j], m1[j], m2[j], m3[j], k);
+  }
+
+  k = lb;
+  /*--------------------------------------------------------------------
+  c   compute an approximate solution on the coarsest grid
+  c-------------------------------------------------------------------*/
+  zero3(u[k], m1[k], m2[k], m3[k]);
+  psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k);
+
+  for (k = lb+1; k <= lt-1; k++) {
+    j = k-1;
+    /*--------------------------------------------------------------------
+    c prolongate from level k-1  to k
+    c-------------------------------------------------------------------*/
+    zero3(u[k], m1[k], m2[k], m3[k]);
+    interp(u[j], m1[j], m2[j], m3[j], u[k], m1[k], m2[k], m3[k], k);
+    /*--------------------------------------------------------------------
+    c compute residual for level k
+    c-------------------------------------------------------------------*/
+    resid(u[k], r[k], r[k], m1[k], m2[k], m3[k], a, k);
+    /*--------------------------------------------------------------------
+    c apply smoother
+    c-------------------------------------------------------------------*/
+    psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k);
+  }
+
+  j = lt - 1;
+  k = lt;
+  interp(u[j], m1[j], m2[j], m3[j], u[lt], n1, n2, n3, k);
+  resid(u[lt], v, r[lt], n1, n2, n3, a, k);
+  psinv(r[lt], u[lt], n1, n2, n3, c, k);
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void psinv(dash::NArray<double, 3> &r, dash::NArray<double, 3> &u, int n1, int n2, int n3, double c[4], int k) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   psinv applies an approximate inverse as smoother:  u = u + Cr
+  c
+  c   This  implementation costs  15A + 4M per result, where
+  c   A and M denote the costs of Addition and Multiplication.
+  c   Presuming coefficient c(3) is zero (the NPB assumes this,
+  c   but it is thus not a general case), 2A + 1M may be eliminated,
+  c   resulting in 13A + 3M.
+  c   Note that this vectorizes, and is also fine for cache
+  c   based machines.
+  c-------------------------------------------------------------------*/
+  double r1[n1], r2[n1];
+
+  int myid = dash::myid();
+
+  auto pattern = u.pattern();
+  auto local_beg_gidx = pattern.coords(pattern.global(0));
+  auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1));
+
+  int topcoord = local_beg_gidx[0]-1;
+  int bottomcoord = local_end_gidx[0]+1;
+  int z_ext = u.local.extent(0);
+
+  // double topplane[n2][n1];
+  // double bottomplane[n2][n1];
+  std::vector<double> topplane(n2*n1);
+  std::vector<double> bottomplane(n2*n1);
+
+  dash::Future<double*> fut_top;
+  dash::Future<double*> fut_bot;
+
+  if ( topcoord >= 0 && z_ext > 0 ) {
+    fut_top = dash::copy_async(r.begin()+n2*n1*topcoord, r.begin()+n2*n1*(topcoord+1), &topplane[0]);
+  }
+
+  if ( bottomcoord < n3 && z_ext > 0 ) {
+    fut_bot = dash::copy_async(r.begin()+n2*n1*bottomcoord, r.begin()+n2*n1*(bottomcoord+1), &bottomplane[0]);
+  }
+
+  for (int i3 = 1; i3 < z_ext-1; i3++) {
+    for (int i2 = 1; i2 < n2-1; i2++) {
+      for (int i1 = 0; i1 < n1; i1++) {
+        r1[i1] = r.local(i3,i2-1,i1) + r.local(i3,i2+1,i1) + r.local(i3-1,i2,i1) + r.local(i3+1,i2,i1);
+        r2[i1] = r.local(i3-1,i2-1,i1) + r.local(i3-1,i2+1,i1) + r.local(i3+1,i2-1,i1) + r.local(i3+1,i2+1,i1);
+      }
+      for (int i1 = 1; i1 < n1-1; i1++) {
+        u.local(i3,i2,i1) = u.local(i3,i2,i1)
+        + c[0] * r.local(i3,i2,i1)
+        + c[1] * ( r.local(i3,i2,i1-1) + r.local(i3,i2,i1+1) + r1[i1] )
+        + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] );
+        //--------------------------------------------------------------------
+        //c  Assume c(3) = 0  (Enable line below if c(3) not= 0)
+        //c-------------------------------------------------------------------
+        //c > + c(3) * ( r2(i1-1) + r2(i1+1) )
+        //c-------------------------------------------------------------------
+      }
+    }
+  }
+
+  if ( z_ext > 1 ) {
+    if ( topcoord >= 0 ) {
+      int i3 = 0;
+      fut_top.wait();
+
+      for (int i2 = 1; i2 < n2-1; i2++) {
+        for (int i1 = 0; i1 < n1; i1++) {
+          r1[i1] = r.local(i3,i2-1,i1) + r.local(i3,i2+1,i1) + topplane[i2*n2+i1] + r.local(i3+1,i2,i1);
+          r2[i1] = topplane[(i2-1)*n2+i1] + topplane[(i2+1)*n2+i1] + r.local(i3+1,i2-1,i1) + r.local(i3+1,i2+1,i1);
+        }
+        for (int i1 = 1; i1 < n1-1; i1++) {
+          u.local(i3,i2,i1) = u.local(i3,i2,i1)
+          + c[0] * r.local(i3,i2,i1)
+          + c[1] * ( r.local(i3,i2,i1-1) + r.local(i3,i2,i1+1) + r1[i1] )
+          + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] );
+          //--------------------------------------------------------------------
+          //c  Assume c(3) = 0  (Enable line below if c(3) not= 0)
+          //c-------------------------------------------------------------------
+          //c > + c(3) * ( r2(i1-1) + r2(i1+1) )
+          //c-------------------------------------------------------------------
+        }
+      }
+    }
+    if ( bottomcoord < n3 ) {
+      int i3 = z_ext-1;
+      fut_bot.wait();
+
+      for (int i2 = 1; i2 < n2-1; i2++) {
+        for (int i1 = 0; i1 < n1; i1++) {
+          r1[i1] = r.local(i3,i2-1,i1) + r.local(i3,i2+1,i1) + r.local(i3-1,i2,i1) + bottomplane[i2*n2+i1];
+          r2[i1] = r.local(i3-1,i2-1,i1) + r.local(i3-1,i2+1,i1) + bottomplane[(i2-1)*n2+i1] + bottomplane[(i2+1)*n2+i1];
+        }
+        for (int i1 = 1; i1 < n1-1; i1++) {
+          u.local(i3,i2,i1) = u.local(i3,i2,i1)
+          + c[0] * r.local(i3,i2,i1)
+          + c[1] * ( r.local(i3,i2,i1-1) + r.local(i3,i2,i1+1) + r1[i1] )
+          + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] );
+          //--------------------------------------------------------------------
+          //c  Assume c(3) = 0  (Enable line below if c(3) not= 0)
+          //c-------------------------------------------------------------------
+          //c > + c(3) * ( r2(i1-1) + r2(i1+1) )
+          //c-------------------------------------------------------------------
+        }
+      }
+    }
+  } else {
+    if ( 0 <= topcoord && bottomcoord < n3 ) {
+      int i3 = 0;
+      fut_top.wait();
+      fut_bot.wait();
+
+      for (int i2 = 1; i2 < n2-1; i2++) {
+        for (int i1 = 0; i1 < n1; i1++) {
+          r1[i1] = r.local(i3,i2-1,i1) + r.local(i3,i2+1,i1) + topplane[i2*n2+i1] + bottomplane[i2*n2+i1];
+          r2[i1] = topplane[(i2-1)*n2+i1] + topplane[(i2+1)*n2+i1] + bottomplane[(i2-1)*n2+i1] + bottomplane[(i2+1)*n2+i1];
+        }
+        for (int i1 = 1; i1 < n1-1; i1++) {
+          u.local(i3,i2,i1) = u.local(i3,i2,i1)
+          + c[0] * r.local(i3,i2,i1)
+          + c[1] * ( r.local(i3,i2,i1-1) + r.local(i3,i2,i1+1) + r1[i1] )
+          + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] );
+          //--------------------------------------------------------------------
+          //c  Assume c(3) = 0  (Enable line below if c(3) not= 0)
+          //c-------------------------------------------------------------------
+          //c > + c(3) * ( r2(i1-1) + r2(i1+1) )
+          //c-------------------------------------------------------------------
+        }
+      }
+    }
+  }
+    /*--------------------------------------------------------------------
+    c   exchange boundary points
+    c-------------------------------------------------------------------*/
+    comm3(u,n1,n2,n3);
+
+    if ( debug_vec[0] >= 1 ) {
+      rep_nrm(u,n1,n2,n3,(char*)"   psinv",k);
+    }
+
+    if ( debug_vec[3] >= k ) {
+      showall(u,n1,n2,n3);
+    }
+  }
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void resid(dash::NArray<double, 3> &u, dash::NArray<double, 3> &v, dash::NArray<double, 3> &r, int n1, int n2, int n3, double a[4], int k) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   resid computes the residual:  r = v - Au
+  c
+  c   This  implementation costs  15A + 4M per result, where
+  c   A and M denote the costs of Addition (or Subtraction) and
+  c   Multiplication, respectively.
+  c   Presuming coefficient a(1) is zero (the NPB assumes this,
+  c   but it is thus not a general case), 3A + 1M may be eliminated,
+  c   resulting in 12A + 3M.
+  c   Note that this vectorizes, and is also fine for cache
+  c   based machines.
+  c-------------------------------------------------------------------*/
+  // double u1[n1], u2[n1];
+  std::vector<double> u1(n1);
+  std::vector<double> u2(n1);
+
+  int myid = dash::myid();
+
+  auto pattern = r.pattern();
+  auto local_beg_gidx = pattern.coords(pattern.global(0));
+  auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1));
+
+  int topcoord = local_beg_gidx[0]-1;
+  int bottomcoord = local_end_gidx[0]+1;
+  int z_ext = u.local.extent(0);
+
+  std::vector<double> topplane(n2*n1);
+  std::vector<double> bottomplane(n2*n1);
+
+  dash::Future<double*> fut_top;
+  dash::Future<double*> fut_bot;
+
+  if ( topcoord >= 0 && z_ext > 0 ) {
+    fut_top = dash::copy_async(u.begin()+n2*n1*topcoord, u.begin()+n2*n1*(topcoord+1), &topplane[0]);
+  }
+
+  if ( bottomcoord < n3 && z_ext > 0 ) {
+    fut_bot = dash::copy_async(u.begin()+n2*n1*bottomcoord, u.begin()+n2*n1*(bottomcoord+1), &bottomplane[0]);
+  }
+
+  for (int i3 = 1; i3 < z_ext-1; i3++) {
+    for (int i2 = 1; i2 < n2-1; i2++) {
+      for (int i1 = 0; i1 < n1; i1++) {
+        u1[i1] = u.local(i3,i2-1,i1) + u.local(i3,i2+1,i1) + u.local(i3-1,i2,i1) + u.local(i3+1,i2,i1);
+        u2[i1] = u.local(i3-1,i2-1,i1) + u.local(i3-1,i2+1,i1) + u.local(i3+1,i2-1,i1) + u.local(i3+1,i2+1,i1);
+      }
+      for (int i1 = 1; i1 < n1-1; i1++) {
+        r.local(i3,i2,i1) = v.local(i3,i2,i1)
+         - a[0] * u.local(i3,i2,i1)
+        //--------------------------------------------------------------------
+        //c  Assume a(1) = 0    (Enable 2 lines below if a(1) not= 0)
+        //c-------------------------------------------------------------------
+        //c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
+        //c > + u1(i1) )
+        //c-------------------------------------------------------------------
+         - a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] )
+         - a[3] * ( u2[i1-1] + u2[i1+1] );
+      }
+    }
+  }
+
+  if ( z_ext > 1 ) {
+    if ( topcoord >= 0 ) {
+      int i3 = 0;
+      fut_top.wait();
+
+      for (int i2 = 1; i2 < n2-1; i2++) {
+        for (int i1 = 0; i1 < n1; i1++) {
+          u1[i1] = u.local(i3,i2-1,i1) + u.local(i3,i2+1,i1) + topplane[i2*n2+i1] + u.local(i3+1,i2,i1);
+          u2[i1] = topplane[(i2-1)*n2+i1] + topplane[(i2+1)*n2+i1] + u.local(i3+1,i2-1,i1) + u.local(i3+1,i2+1,i1);
+        }
+        for (int i1 = 1; i1 < n1-1; i1++) {
+          r.local(i3,i2,i1) = v.local(i3,i2,i1)
+           - a[0] * u.local(i3,i2,i1)
+          //--------------------------------------------------------------------
+          //c  Assume a(1) = 0    (Enable 2 lines below if a(1) not= 0)
+          //c-------------------------------------------------------------------
+          //c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
+          //c > + u1(i1) )
+          //c-------------------------------------------------------------------
+           - a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] )
+           - a[3] * ( u2[i1-1] + u2[i1+1] );
+        }
+      }
+    }
+    if ( bottomcoord < n3 ) {
+      int i3 = z_ext-1;
+      fut_bot.wait();
+
+      for (int i2 = 1; i2 < n2-1; i2++) {
+        for (int i1 = 0; i1 < n1; i1++) {
+          u1[i1] = u.local(i3,i2-1,i1) + u.local(i3,i2+1,i1) + u.local(i3-1,i2,i1) + bottomplane[i2*n2+i1];
+          u2[i1] = u.local(i3-1,i2-1,i1) + u.local(i3-1,i2+1,i1) + bottomplane[(i2-1)*n2+i1] + bottomplane[(i2+1)*n2+i1];
+        }
+        for (int i1 = 1; i1 < n1-1; i1++) {
+          r.local(i3,i2,i1) = v.local(i3,i2,i1)
+           - a[0] * u.local(i3,i2,i1)
+          //--------------------------------------------------------------------
+          //c  Assume a(1) = 0    (Enable 2 lines below if a(1) not= 0)
+          //c-------------------------------------------------------------------
+          //c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
+          //c > + u1(i1) )
+          //c-------------------------------------------------------------------
+           - a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] )
+           - a[3] * ( u2[i1-1] + u2[i1+1] );
+        }
+      }
+    }
+  } else {
+    if ( 0 <= topcoord && bottomcoord < n3 ) {
+      int i3 = 0;
+      fut_top.wait();
+      fut_bot.wait();
+
+      for (int i2 = 1; i2 < n2-1; i2++) {
+        for (int i1 = 0; i1 < n1; i1++) {
+          u1[i1] = u.local(i3,i2-1,i1) + u.local(i3,i2+1,i1) + topplane[i2*n2+i1] + bottomplane[i2*n2+i1];
+          u2[i1] = topplane[(i2-1)*n2+i1] + topplane[(i2+1)*n2+i1] + bottomplane[(i2-1)*n2+i1] + bottomplane[(i2+1)*n2+i1];
+        }
+        for (int i1 = 1; i1 < n1-1; i1++) {
+          r.local(i3,i2,i1) = v.local(i3,i2,i1)
+           - a[0] * u.local(i3,i2,i1)
+          //--------------------------------------------------------------------
+          //c  Assume a(1) = 0    (Enable 2 lines below if a(1) not= 0)
+          //c-------------------------------------------------------------------
+          //c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
+          //c > + u1(i1) )
+          //c-------------------------------------------------------------------
+           - a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] )
+           - a[3] * ( u2[i1-1] + u2[i1+1] );
+        }
+      }
+    }
+  }
+
+  /*--------------------------------------------------------------------
+  c   exchange boundary data
+  c--------------------------------------------------------------------*/
+  comm3(r,n1,n2,n3);
+
+  if ( debug_vec[0] >= 1 ) {
+    rep_nrm(r,n1,n2,n3,(char*)"   resid",k);
+  }
+
+  if ( debug_vec[2] >= k ) {
+    showall(r,n1,n2,n3);
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void rprj3(dash::NArray<double, 3> &r, int m1k, int m2k, int m3k, dash::NArray<double, 3> &s, int m1j, int m2j, int m3j, int k) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   rprj3 projects onto the next coarser grid,
+  c   using a trilinear Finite Element projection:  s = r' = P r
+  c
+  c   This  implementation costs  20A + 4M per result, where
+  c   A and M denote the costs of Addition and Multiplication.
+  c   Note that this vectorizes, and is also fine for cache
+  c   based machines.
+  c-------------------------------------------------------------------*/
+
+  int d1, d2, d3;
+
+  if ( m1k == 3 ) {
+    d1 = 2;
+  } else {
+    d1 = 1;
+  }
+
+  if ( m2k == 3 ) {
+    d2 = 2;
+  } else {
+    d2 = 1;
+  }
+
+  if ( m3k == 3 ) {
+    d3 = 2;
+  } else {
+    d3 = 1;
+  }
+
+  int j2, j1, i3, i2, i1;
+  double x1[m1k], y1[m1k], x2, y2;
+
+  auto s_pattern = s.pattern();
+  auto s_local_beg_gidx = s_pattern.coords(s_pattern.global(0));
+  auto s_local_end_gidx = s_pattern.coords(s_pattern.global(s_pattern.local_size()-1));
+
+  auto r_pattern = r.pattern();
+  auto r_local_beg_gidx = r_pattern.coords(r_pattern.global(0));
+  auto r_local_end_gidx = r_pattern.coords(r_pattern.global(r_pattern.local_size()-1));
+  int r_offset = r_local_beg_gidx[0];
+  int r_x = (int) r.extent(2);
+  int r_y = (int) r.extent(1);
+
+  int start = 0;
+  if ( s_local_beg_gidx[0] == 0 ) start++;
+
+  int end = s.local.extent(0);
+  if ( s_local_end_gidx[0] == s.extent(0)-1 ) end--;
+
+  int r_planes = 2*(end-start)+1;
+
+  std::vector<std::vector<double> > r_local_v(r_planes);
+
+  for (int i = 0; i < r_planes; i++) {
+    r_local_v[i] = std::vector<double> (r_y*r_x);
+  }
+
+  std::vector<double*> r_local(r_planes);
+  std::vector<bool> is_local(r_planes);
+  int r_idx = 0;
+  int r_psize = r_y*r_x;
+
+  dash::Future<double*> futs[r_planes];
+
+  for (int j3l = start; j3l < end; j3l++){
+    int j3 = s_local_beg_gidx[0]+j3l;
+    i3 = 2*j3-d3;
+    if ( r(i3,0,0).is_local() ) {
+      r_local[r_idx] = r.lbegin()+r_psize*(i3-r_offset);
+      is_local[r_idx] = true;
+    } else {
+      futs[r_idx] = dash::copy_async(r.begin()+r_psize*i3, r.begin()+r_psize*(i3+1), r_local_v[r_idx].data());
+      r_local[r_idx] = r_local_v[r_idx].data();
+    }
+    r_idx++;
+    if ( r(i3+1,0,0).is_local() ) {
+      r_local[r_idx] = r.lbegin()+r_psize*(i3+1-r_offset);
+      is_local[r_idx] = true;
+    } else {
+      futs[r_idx] = dash::copy_async(r.begin()+r_psize*(i3+1), r.begin()+r_psize*(i3+2), r_local_v[r_idx].data());
+      r_local[r_idx] = r_local_v[r_idx].data();
+    }
+    r_idx++;
+  }
+
+
+  if ( start < end ) {
+    i3 = 2*(s_local_beg_gidx[0]+end)-d3;
+    if ( r(i3,0,0).is_local() ) {
+      r_local[r_idx] = r.lbegin()+r_psize*(i3-r_offset);
+      is_local[r_idx] = true;
+    } else {
+      futs[r_idx] = dash::copy_async(r.begin()+r_psize*i3, r.begin()+r_psize*(i3+1), r_local_v[r_idx].data());
+      r_local[r_idx] = r_local_v[r_idx].data();
+    }
+  }
+
+  //First, do the local calculations...
+  r_idx = 0;
+
+  for (int j3l = start; j3l < end; j3l++) {
+    if ( is_local[r_idx+0] && is_local[r_idx+1] && is_local[r_idx+2] ) {
+      int j3 = s_local_beg_gidx[0]+j3l;
+      i3 = 2*j3-d3;
+      //C  i3 = 2*j3-1
+
+      for (j2 = 1; j2 < m2j-1; j2++) {
+        i2 = 2*j2-d2;
+        //C  i2 = 2*j2-1
+
+        for (j1 = 1; j1 < m1j; j1++) {
+          i1 = 2*j1-d1;
+          //C  i1 = 2*j1-1
+          x1[i1] = r_local[r_idx+1][i2*r_x+i1] + r_local[r_idx+1][(i2+2)*r_x+i1] + r_local[r_idx+0][(i2+1)*r_x+i1] + r_local[r_idx+2][(i2+1)*r_x+i1];
+          y1[i1] = r_local[r_idx+0][i2*r_x+i1] + r_local[r_idx+2][i2*r_x+i1] + r_local[r_idx+0][(i2+2)*r_x+i1] + r_local[r_idx+2][(i2+2)*r_x+i1];
+        }
+
+        for (j1 = 1; j1 < m1j-1; j1++) {
+          i1 = 2*j1-d1;
+          //C  i1 = 2*j1-1
+          y2 = r_local[r_idx+0][i2*r_x+i1+1] + r_local[r_idx+2][i2*r_x+i1+1] + r_local[r_idx+0][(i2+2)*r_x+i1+1] + r_local[r_idx+2][(i2+2)*r_x+i1+1];
+          x2 = r_local[r_idx+1][i2*r_x+i1+1] + r_local[r_idx+1][(i2+2)*r_x+i1+1] + r_local[r_idx+0][(i2+1)*r_x+i1+1] + r_local[r_idx+2][(i2+1)*r_x+i1+1];
+          s.local(j3l,j2,j1) =
+            0.5 * r_local[r_idx+1][(i2+1)*r_x+i1+1]
+            + 0.25 * ( r_local[r_idx+1][(i2+1)*r_x+i1] + r_local[r_idx+1][(i2+1)*r_x+i1+2] + x2)
+            + 0.125 * ( x1[i1] + x1[i1+2] + y2)
+            + 0.0625 * ( y1[i1] + y1[i1+2] );
+        }
+      }
+    }
+    r_idx = r_idx + 2;
+  }
+
+  //...then do the non-local calculations.
+  r_idx = 0;
+
+  for (int j3l = start; j3l < end; j3l++) {
+    if ( !(is_local[r_idx+0] && is_local[r_idx+1] && is_local[r_idx+2]) ) {
+      int j3 = s_local_beg_gidx[0]+j3l;
+      i3 = 2*j3-d3;
+      //C  i3 = 2*j3-1
+
+      if(!is_local[r_idx+0]) futs[r_idx+0].wait();
+      if(!is_local[r_idx+1]) futs[r_idx+1].wait();
+      if(!is_local[r_idx+2]) futs[r_idx+2].wait();
+
+      for (j2 = 1; j2 < m2j-1; j2++) {
+        i2 = 2*j2-d2;
+        //C  i2 = 2*j2-1
+
+        for (j1 = 1; j1 < m1j; j1++) {
+          i1 = 2*j1-d1;
+          //C  i1 = 2*j1-1
+          x1[i1] = r_local[r_idx+1][i2*r_x+i1] + r_local[r_idx+1][(i2+2)*r_x+i1] + r_local[r_idx+0][(i2+1)*r_x+i1] + r_local[r_idx+2][(i2+1)*r_x+i1];
+          y1[i1] = r_local[r_idx+0][i2*r_x+i1] + r_local[r_idx+2][i2*r_x+i1] + r_local[r_idx+0][(i2+2)*r_x+i1] + r_local[r_idx+2][(i2+2)*r_x+i1];
+        }
+
+        for (j1 = 1; j1 < m1j-1; j1++) {
+          i1 = 2*j1-d1;
+          //C  i1 = 2*j1-1
+          y2 = r_local[r_idx+0][i2*r_x+i1+1] + r_local[r_idx+2][i2*r_x+i1+1] + r_local[r_idx+0][(i2+2)*r_x+i1+1] + r_local[r_idx+2][(i2+2)*r_x+i1+1];
+          x2 = r_local[r_idx+1][i2*r_x+i1+1] + r_local[r_idx+1][(i2+2)*r_x+i1+1] + r_local[r_idx+0][(i2+1)*r_x+i1+1] + r_local[r_idx+2][(i2+1)*r_x+i1+1];
+          s.local(j3l,j2,j1) =
+            0.5 * r_local[r_idx+1][(i2+1)*r_x+i1+1]
+            + 0.25 * ( r_local[r_idx+1][(i2+1)*r_x+i1] + r_local[r_idx+1][(i2+1)*r_x+i1+2] + x2)
+            + 0.125 * ( x1[i1] + x1[i1+2] + y2)
+            + 0.0625 * ( y1[i1] + y1[i1+2] );
+        }
+      }
+    }
+    r_idx = r_idx + 2;
+  }
+
+  comm3(s,m1j,m2j,m3j);
+
+  if ( debug_vec[0] >= 1 ) {
+    rep_nrm(s,m1j,m2j,m3j,(char*)"   rprj3",k-1);
+  }
+
+  if ( debug_vec[4] >= k ) {
+    showall(s,m1j,m2j,m3j);
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void interp(dash::NArray<double, 3> &z, int mm1, int mm2, int mm3, dash::NArray<double, 3> &u, int n1, int n2, int n3, int k) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   interp adds the trilinear interpolation of the correction
+  c   from the coarser grid to the current approximation:  u = u + Qu'
+  c
+  c   Observe that this  implementation costs  16A + 4M, where
+  c   A and M denote the costs of Addition and Multiplication.
+  c   Note that this vectorizes, and is also fine for cache
+  c   based machines.  Vector machines may get slightly better
+  c   performance however, with 8 separate "do i1" loops, rather than 4.
+  c-------------------------------------------------------------------*/
+
+  int d1, d2, d3, t1, t2, t3;
+
+  /*
+  c note that m = 1037 in globals.h but for this only need to be
+  c 535 to handle up to 1024^3
+  c integer m
+  c parameter( m=535 )
+  */
+
+  if ( n1 != 3 && n2 != 3 && n3 != 3 ) {
+
+    double z1[mm1], z2[mm1], z3[mm1];
+
+    auto pattern = u.pattern();
+    auto local_beg_gidx = pattern.coords(pattern.global(0));
+    auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1));
+
+    auto z_pattern = z.pattern();
+    auto z_local_beg_gidx = z_pattern.coords(z_pattern.global(0));
+    auto z_local_end_gidx = z_pattern.coords(z_pattern.global(z_pattern.local_size()-1));
+    int z_offset = z_local_beg_gidx[0];
+
+    int u_offset = local_beg_gidx[0];
+
+    int start = 0 - u_offset%2;
+    int u_planes = u.local.extent(0);
+    int z_size = mm2*mm1;
+
+    if ( (int) u_planes > 0 ) {
+      int z_local_size = u_planes-start+((u_planes+start)%2);
+
+      std::vector<std::vector<double> > z_local_v(z_local_size);
+      for (int i = 0; i < z_local_size; i++) {
+        z_local_v[i] = std::vector<double> (mm2*mm1);
+      }
+      std::vector<double*> z_local(z_local_size);
+      std::vector<bool> is_local(z_local_size);
+
+      dash::Future<double*> futs[z_local_size];
+
+      for (int j3 = start; j3 < u_planes; j3+=2) {
+        int i3 = (u_offset + j3)/2;
+        if ( z(i3,0,0).is_local() ) {
+          z_local[j3-start] = z.lbegin()+z_size*(i3-z_offset);
+          is_local[j3-start] = true;
+        } else {
+          futs[j3-start] = dash::copy_async(z.begin()+z_size*i3, z.begin()+z_size*(i3+1), z_local_v[j3-start].data());
+          z_local[j3-start] = z_local_v[j3-start].data();
+        }
+        if ( z(i3+1,0,0).is_local() ) {
+          z_local[j3-start+1] = z.lbegin()+z_size*(i3+1-z_offset);
+          is_local[j3-start+1] = true;
+        } else {
+            futs[j3-start+1] = dash::copy_async(z.begin()+z_size*(i3+1), z.begin()+z_size*(i3+2), z_local_v[j3-start+1].data());
+            z_local[j3-start+1] = z_local_v[j3-start+1].data();
+        }
+      }
+
+      //First, do the local calculations...
+      for (int j3 = start; j3 < u_planes; j3+=2) {
+        // int i3 = (u_offset + j3)/2;
+        if ( is_local[j3-start] && is_local[j3-start+1] ) {
+
+          for (int i2 = 0; i2 < mm2-1; i2++) {
+            for (int i1 = 0; i1 < mm1; i1++) {
+              z1[i1] = z_local[j3-start][(i2+1)*mm2+i1] + z_local[j3-start][i2*mm2+i1];
+              z2[i1] = z_local[j3-start+1][i2*mm2+i1] + z_local[j3-start][i2*mm2+i1];
+              z3[i1] = z_local[j3-start+1][(i2+1)*mm2+i1] + z_local[j3-start+1][i2*mm2+i1] + z1[i1];
+            }
+            if ( j3 >= 0 ) {
+              for (int i1 = 0; i1 < mm1-1; i1++) {
+                u.local(j3,2*i2,2*i1)     = u.local(j3,2*i2,2*i1) + z_local[j3-start][i2*mm2+i1];
+                u.local(j3,2*i2,2*i1+1)   = u.local(j3,2*i2,2*i1+1) + 0.5*(z_local[j3-start][i2*mm2+i1+1]+z_local[j3-start][i2*mm2+i1]);
+                u.local(j3,2*i2+1,2*i1)   = u.local(j3,2*i2+1,2*i1) + 0.5 * z1[i1];
+                u.local(j3,2*i2+1,2*i1+1) = u.local(j3,2*i2+1,2*i1+1) + 0.25*( z1[i1] + z1[i1+1] );
+              }
+            }
+            if ( j3+1 < u_planes ) {
+              for (int i1 = 0; i1 < mm1-1; i1++) {
+                u.local(j3+1,2*i2,2*i1)     = u.local(j3+1,2*i2,2*i1) + 0.5 * z2[i1];
+                u.local(j3+1,2*i2,2*i1+1)   = u.local(j3+1,2*i2,2*i1+1) + 0.25*( z2[i1] + z2[i1+1] );
+                u.local(j3+1,2*i2+1,2*i1)   = u.local(j3+1,2*i2+1,2*i1) + 0.25* z3[i1];
+                u.local(j3+1,2*i2+1,2*i1+1) = u.local(j3+1,2*i2+1,2*i1+1) + 0.125*( z3[i1] + z3[i1+1] );
+              }
+            }
+          }
+        }
+      }
+
+      //...then do the non-local calculations.
+      for (int j3 = start; j3 < u_planes; j3+=2) {
+        if ( !(is_local[j3-start] && is_local[j3-start+1]) ) {
+          // int i3 = (u_offset + j3)/2;
+          if(!is_local[j3-start+0]) futs[j3-start].wait();
+          if(!is_local[j3-start+1]) futs[j3-start+1].wait();
+
+          for (int i2 = 0; i2 < mm2-1; i2++) {
+            for (int i1 = 0; i1 < mm1; i1++) {
+              z1[i1] = z_local[j3-start][(i2+1)*mm2+i1] + z_local[j3-start][i2*mm2+i1];
+              z2[i1] = z_local[j3-start+1][i2*mm2+i1] + z_local[j3-start][i2*mm2+i1];
+              z3[i1] = z_local[j3-start+1][(i2+1)*mm2+i1] + z_local[j3-start+1][i2*mm2+i1] + z1[i1];
+            }
+            if ( j3 >= 0 ) {
+              for (int i1 = 0; i1 < mm1-1; i1++) {
+                u.local(j3,2*i2,2*i1)     = u.local(j3,2*i2,2*i1) + z_local[j3-start][i2*mm2+i1];
+                u.local(j3,2*i2,2*i1+1)   = u.local(j3,2*i2,2*i1+1) + 0.5*(z_local[j3-start][i2*mm2+i1+1]+z_local[j3-start][i2*mm2+i1]);
+                u.local(j3,2*i2+1,2*i1)   = u.local(j3,2*i2+1,2*i1) + 0.5 * z1[i1];
+                u.local(j3,2*i2+1,2*i1+1) = u.local(j3,2*i2+1,2*i1+1) + 0.25*( z1[i1] + z1[i1+1] );
+              }
+            }
+            if ( j3+1 < u_planes ) {
+              for (int i1 = 0; i1 < mm1-1; i1++) {
+                u.local(j3+1,2*i2,2*i1)     = u.local(j3+1,2*i2,2*i1) + 0.5 * z2[i1];
+                u.local(j3+1,2*i2,2*i1+1)   = u.local(j3+1,2*i2,2*i1+1) + 0.25*( z2[i1] + z2[i1+1] );
+                u.local(j3+1,2*i2+1,2*i1)   = u.local(j3+1,2*i2+1,2*i1) + 0.25* z3[i1];
+                u.local(j3+1,2*i2+1,2*i1+1) = u.local(j3+1,2*i2+1,2*i1+1) + 0.125*( z3[i1] + z3[i1+1] );
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } else { //this gets never called for our benchmark sizes, so we didn't parallelize it.
+
+    if ( n1 == 3 ) {
+      d1 = 2;
+      t1 = 1;
+    } else {
+      d1 = 1;
+      t1 = 0;
+    }
+    if ( n2 == 3 ) {
+      d2 = 2;
+      t2 = 1;
+    } else {
+      d2 = 1;
+      t2 = 0;
+    }
+    if ( n3 == 3 ) {
+      d3 = 2;
+      t3 = 1;
+    } else {
+      d3 = 1;
+      t3 = 0;
+    }
+    if ( 0 == dash::myid() ) {
+
+      for (int i3 = d3; i3 <= mm3-1; i3++) {
+        for (int i2 = d2; i2 <= mm2-1; i2++) {
+          for (int i1 = d1; i1 <= mm1-1; i1++) {
+            u[2*i3-d3-1][2*i2-d2-1][2*i1-d1-1] =
+            u[2*i3-d3-1][2*i2-d2-1][2*i1-d1-1]
+            +z[i3-1][i2-1][i1-1];
+          }
+          for (int i1 = 1; i1 <= mm1-1; i1++) {
+            u[2*i3-d3-1][2*i2-d2-1][2*i1-t1-1] =
+            u[2*i3-d3-1][2*i2-d2-1][2*i1-t1-1]
+            +0.5*(z[i3-1][i2-1][i1]+z[i3-1][i2-1][i1-1]);
+          }
+        }
+        for (int i2 = 1; i2 <= mm2-1; i2++) {
+          for (int i1 = d1; i1 <= mm1-1; i1++) {
+            u[2*i3-d3-1][2*i2-t2-1][2*i1-d1-1] =
+            u[2*i3-d3-1][2*i2-t2-1][2*i1-d1-1]
+            +0.5*(z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]);
+          }
+          for (int i1 = 1; i1 <= mm1-1; i1++) {
+            u[2*i3-d3-1][2*i2-t2-1][2*i1-t1-1] =
+            u[2*i3-d3-1][2*i2-t2-1][2*i1-t1-1]
+            +0.25*(z[i3-1][i2][i1]+z[i3-1][i2-1][i1] + z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]);
+          }
+        }
+      }
+
+      for (int i3 = 1; i3 <= mm3-1; i3++) {
+        for (int i2 = d2; i2 <= mm2-1; i2++) {
+          for (int i1 = d1; i1 <= mm1-1; i1++) {
+            u[2*i3-t3-1][2*i2-d2-1][2*i1-d1-1] =
+            u[2*i3-t3-1][2*i2-d2-1][2*i1-d1-1]
+            +0.5*(z[i3][i2-1][i1-1]+z[i3-1][i2-1][i1-1]);
+          }
+          for (int i1 = 1; i1 <= mm1-1; i1++) {
+            u[2*i3-t3-1][2*i2-d2-1][2*i1-t1-1] =
+            u[2*i3-t3-1][2*i2-d2-1][2*i1-t1-1]
+            +0.25*(z[i3][i2-1][i1]+z[i3][i2-1][i1-1] + z[i3-1][i2-1][i1]+z[i3-1][i2-1][i1-1]);
+          }
+        }
+        for (int i2 = 1; i2 <= mm2-1; i2++) {
+          for (int i1 = d1; i1 <= mm1-1; i1++) {
+            u[2*i3-t3-1][2*i2-t2-1][2*i1-d1-1] =
+            u[2*i3-t3-1][2*i2-t2-1][2*i1-d1-1]
+            +0.25*(z[i3][i2][i1-1]+z[i3][i2-1][i1-1] + z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]);
+          }
+          for (int i1 = 1; i1 <= mm1-1; i1++) {
+            u[2*i3-t3-1][2*i2-t2-1][2*i1-t1-1] =
+            u[2*i3-t3-1][2*i2-t2-1][2*i1-t1-1]
+            +0.125*(z[i3][i2][i1]+z[i3][i2-1][i1]
+              +z[i3][i2][i1-1]+z[i3][i2-1][i1-1]
+              +z[i3-1][i2][i1]+z[i3-1][i2-1][i1]
+              +z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]);
+          }
+        }
+      }
+
+
+      {
+        if ( debug_vec[0] >= 1 ) {
+          rep_nrm(z,mm1,mm2,mm3,(char*)"z: inter",k-1);
+          rep_nrm(u,n1,n2,n3,(char*)"u: inter",k);
+        }
+        if ( debug_vec[5] >= k ) {
+          showall(z,mm1,mm2,mm3);
+          showall(u,n1,n2,n3);
+        }
+      }
+    }
+  }
+  dash::barrier();
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void norm2u3(dash::NArray<double, 3> &r, int n1, int n2, int n3, double *rnm2, double *rnmu, int nx, int ny, int nz) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   norm2u3 evaluates approximations to the L2 norm and the
+  c   uniform (or L-infinity or Chebyshev) norm, under the
+  c   assumption that the boundaries are periodic or zero.  Add the
+  c   boundaries in with half weight (quarter weight on the edges
+  c   and eighth weight at the corners) for inhomogeneous boundaries.
+  c-------------------------------------------------------------------*/
+
+  double s, a;
+  double tmp;
+  int i3, i2, i1, n;
+  dash::Array<double> p_s(dash::size());
+  dash::Array<double> p_a(dash::size());
+
+  auto pattern = r.pattern();
+  auto local_beg_gidx = pattern.coords(pattern.global(0));
+  auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1));
+
+  int start = 0;
+  if ( local_beg_gidx[0] == 0 ) start++;
+
+  int end = r.local.extent(0);
+  if ( local_end_gidx[0] == n3-1 ) end--;
+
+  n = nx*ny*nz;
+
+  for (i3 = start; i3 < end; i3++) {
+    for (i2 = 1; i2 < n2-1; i2++) {
+      for (i1 = 1; i1 < n1-1; i1++) {
+        p_s.local[0] = p_s.local[0] + r.local(i3,i2,i1) * r.local(i3,i2,i1);
+        tmp = fabs(r.local(i3,i2,i1));
+        if ( tmp > p_a.local[0] ) p_a.local[0] = tmp;
+      }
+    }
+  }
+
+  dash::barrier();
+  s = dash::reduce(p_s.begin(), p_s.end(), 0.0, std::plus<double>());
+  a = (double) (*dash::max_element(p_a.begin(), p_a.end()));
+  if ( a > *rnmu ) *rnmu = a;
+
+  *rnm2 = sqrt(s/(double)n);
+
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void rep_nrm(dash::NArray<double, 3> &u, int n1, int n2, int n3, char *title, int kk) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   report on norm
+  c-------------------------------------------------------------------*/
+
+  double rnm2, rnmu;
+  norm2u3(u,n1,n2,n3,&rnm2,&rnmu,nx[kk],ny[kk],nz[kk]);
+  if ( dash::myid() == 0 ) printf(" Level%2d in %8s: norms =%21.14e%21.14e\n", kk, title, rnm2, rnmu);
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void comm3(dash::NArray<double, 3> &u, int n1, int n2, int n3) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   comm3 organizes the communication on all borders
+  c-------------------------------------------------------------------*/
+  auto pattern = u.pattern();
+  auto local_beg_gidx = pattern.coords(pattern.global(0));
+  auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1));
+
+  int start = 0;
+  if ( local_beg_gidx[0] == 0 ) start++;
+
+  int end = u.local.extent(0);
+  if ( local_end_gidx[0] == n3-1 ) end--;
+
+  // axis = 1
+
+  for (int i3 = start; i3 < end; i3++) {
+    for (int i2 = 1; i2 < n2-1; i2++) {
+      u.local(i3,i2,n1-1) = u.local(i3,i2,1);
+      u.local(i3,i2,0) = u.local(i3,i2,n1-2);
+    }
+  }
+
+  // axis = 2
+
+  for (int i3 = start; i3 < end; i3++) {
+    for (int i1 = 0; i1 < n1; i1++) {
+      u.local(i3,n2-1,i1) = u.local(i3,1,i1);
+      u.local(i3,0,i1) = u.local(i3,n2-2,i1);
+    }
+  }
+
+  dash::barrier();
+  // axis = 3
+  if ( u(u.extent(0)-1,0,0).is_local() ) {
+    dash::copy(u.begin()+n2*n1*1, u.begin()+n2*n1*2, u.lbegin()+n2*n1*end);
+  }
+
+  if ( u(0,0,0).is_local() ) {
+    dash::copy(u.begin()+n2*n1*(n3-2), u.begin()+n2*n1*(n3-1), u.lbegin());
+  }
+
+  dash::barrier();
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void zran3(dash::NArray<double, 3> &z, int n1, int n2, int n3, int nx, int ny, int k) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   zran3  loads +1 at ten randomly chosen points,
+  c   loads -1 at a different ten random points,
+  c   and zero elsewhere.
+  c-------------------------------------------------------------------*/
+
+  #define MM  10
+  #define  A  pow(5.0,13)
+  #define  X  314159265.e0
+
+  int i0, m0, m1;
+  /*int i1, i2, i3, d1, e1, e2, e3;*/
+  int i1, i2, i3, d1, e2, e3;
+  double xx, x0, x1, a1, a2, ai;
+
+  double ten[MM][2], best;
+  int i, j1[MM][2], j2[MM][2], j3[MM][2];
+
+
+  /*double rdummy;*/
+
+  a1 = power( A, nx );
+  a2 = power( A, nx*ny );
+
+  zero3(z,n1,n2,n3);
+
+  i = is1-1+nx*(is2-1+ny*(is3-1));
+
+  ai = power( A, i );
+  d1 = ie1 - is1 + 1;
+  /*e1 = ie1 - is1 + 2;*/
+  e2 = ie2 - is2 + 2;
+  e3 = ie3 - is3 + 2;
+  x0 = X;
+  /*rdummy = */randlc( &x0, ai );
+  auto pattern = z.pattern();
+  auto local_beg_gidx = pattern.coords(pattern.global(0));
+  auto local_end_gidx = pattern.coords(pattern.global(pattern.local_size()-1));
+
+  int start = 0;
+  if ( local_beg_gidx[0] == 0 ) start++;
+
+  int end = z.local.extent(0);
+  if ( local_end_gidx[0] == z.extent(0)-1 ) end--;
+
+  for (int j = 0; j < (int) local_beg_gidx[0]-1; j++){
+    randlc( &x0, a2 );
+  }
+
+  for (i3 = start; i3 < end; i3++) {
+    x1 = x0;
+    for (i2 = 1; i2 < e2; i2++) {
+      xx = x1;
+      double tmp[d1+1];
+      vranlc( d1, &xx, A, tmp);
+      for (int i = 1; i <= d1; i++) z.local(i3,i2,i) = tmp[i];
+      // vranlc( d1, &xx, A, (double *) &(z[i3][i2][0]));
+      /*rdummy = */randlc( &x1, a1 );
+    }
+    /*rdummy = */randlc( &x0, a2 );
+  }
+
+  dash::barrier();
+  if ( dash::myid() == 0 ) {
+
+  /*--------------------------------------------------------------------
+  c   call comm3(z,n1,n2,n3)
+  c   call showall(z,n1,n2,n3)
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   each processor looks for twenty candidates
+  c-------------------------------------------------------------------*/
+
+  for (i = 0; i < MM; i++) {
+    ten[i][1] = 0.0;
+    j1[i][1] = 0;
+    j2[i][1] = 0;
+    j3[i][1] = 0;
+    ten[i][0] = 1.0;
+    j1[i][0] = 0;
+    j2[i][0] = 0;
+    j3[i][0] = 0;
+  }
+  double current;
+  for (i3 = 1; i3 < n3-1; i3++) {
+    for (i2 = 1; i2 < n2-1; i2++) {
+      for (i1 = 1; i1 < n1-1; i1++) {
+        current = z(i3,i2,i1);
+        if ( current > ten[0][1] ) {
+          ten[0][1] = current;
+          j1[0][1] = i1;
+          j2[0][1] = i2;
+          j3[0][1] = i3;
+          bubble( ten, j1, j2, j3, MM, 1 );
+        }
+        if ( current < ten[0][0] ) {
+          ten[0][0] = current;
+          j1[0][0] = i1;
+          j2[0][0] = i2;
+          j3[0][0] = i3;
+          bubble( ten, j1, j2, j3, MM, 0 );
+        }
+      }
+    }
+  }
+  /*--------------------------------------------------------------------
+  c   Now which of these are globally best?
+  c-------------------------------------------------------------------*/
+  // i1 = MM - 1;
+  // i0 = MM - 1;
+  // int jg[4][MM][2];
+  // for (i = MM - 1 ; i >= 0; i--) {
+  //   best = z[j3[i1][1]][j2[i1][1]][j1[i1][1]];
+  //   if (best == z[j3[i1][1]][j2[i1][1]][j1[i1][1]]) {
+  //     jg[0][i][1] = 0;
+  //     jg[1][i][1] = is1 - 1 + j1[i1][1];
+  //     jg[2][i][1] = is2 - 1 + j2[i1][1];
+  //     jg[3][i][1] = is3 - 1 + j3[i1][1];
+  //     i1 = i1-1;
+  //   } else {
+  //     jg[0][i][1] = 0;
+  //     jg[1][i][1] = 0;
+  //     jg[2][i][1] = 0;
+  //     jg[3][i][1] = 0;
+  //   }
+  //   ten[i][1] = best;
+  //   best = z[j3[i0][0]][j2[i0][0]][j1[i0][0]];
+  //   if (best == z[j3[i0][0]][j2[i0][0]][j1[i0][0]]) {
+  //     jg[0][i][0] = 0;
+  //     jg[1][i][0] = is1 - 1 + j1[i0][0];
+  //     jg[2][i][0] = is2 - 1 + j2[i0][0];
+  //     jg[3][i][0] = is3 - 1 + j3[i0][0];
+  //     i0 = i0-1;
+  //   } else {
+  //     jg[0][i][0] = 0;
+  //     jg[1][i][0] = 0;
+  //     jg[2][i][0] = 0;
+  //     jg[3][i][0] = 0;
+  //   }
+  //   ten[i][0] = best;
+  // }
+  // m1 = i1+1;
+  // m0 = i0+1;
+
+  /* printf(" negative charges at");
+  for (i = 0; i < MM; i++) {
+    if (i%5 == 0) printf("\n");
+    printf(" (%3d,%3d,%3d)", jg[1][i][0], jg[2][i][0], jg[3][i][0]);
+  }
+  printf("\n positive charges at");
+  for (i = 0; i < MM; i++) {
+    if (i%5 == 0) printf("\n");
+    printf(" (%3d,%3d,%3d)", jg[1][i][1], jg[2][i][1], jg[3][i][1]);
+  }
+  printf("\n small random numbers were\n");
+  for (i = MM-1; i >= 0; i--) {
+    printf(" %15.8e", ten[i][0]);
+  }
+  printf("\n and they were found on processor number\n");
+  for (i = MM-1; i >= 0; i--) {
+    printf(" %4d", jg[0][i][0]);
+  }
+  printf("\n large random numbers were\n");
+  for (i = MM-1; i >= 0; i--) {
+    printf(" %15.8e", ten[i][1]);
+  }
+  printf("\n and they were found on processor number\n");
+  for (i = MM-1; i >= 0; i--) {
+    printf(" %4d", jg[0][i][1]);
+  }
+  printf("\n");*/
+}
+  dash::barrier();
+  zero3(z, n1, n2, n3);
+  dash::barrier();
+  if ( dash::myid() == 0 ) {
+
+    for (i = MM-1; i >= m0; i--) {
+      z[j3[i][0]][j2[i][0]][j1[i][0]] = -1.0;
+    }
+    for (i = MM-1; i >= m1; i--) {
+      z[j3[i][1]][j2[i][1]][j1[i][1]] = 1.0;
+    }
+  }
+  dash::barrier();
+  comm3(z,n1,n2,n3);
+
+  /*--------------------------------------------------------------------
+  c   call showall(z,n1,n2,n3)
+  c-------------------------------------------------------------------*/
+
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void showall(dash::NArray<double, 3> &z, int n1, int n2, int n3) {
+  if ( dash::myid() == 0 ) {
+    /*--------------------------------------------------------------------
+    c-------------------------------------------------------------------*/
+
+    int i1,i2,i3;
+    int m1, m2, m3;
+
+    m1 = min(n1,18);
+    m2 = min(n2,14);
+    m3 = min(n3,18);
+
+    printf("\n");
+    for (i3 = 0; i3 < m3; i3++) {
+      for (i1 = 0; i1 < m1; i1++) {
+        for (i2 = 0; i2 < m2; i2++) {
+          printf("%6.3f", (double) z[i3][i2][i1]);
+        }
+        printf("\n");
+      }
+      printf(" - - - - - - - \n");
+    }
+    printf("\n");
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static double power(double a, int n) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   power  raises an integer, disguised as a double
+  c   precision real, to an integer power
+  c-------------------------------------------------------------------*/
+  double aj;
+  int nj;
+  /* double rdummy;*/
+  double power;
+
+  power = 1.0;
+  nj = n;
+  aj = a;
+
+  while (nj != 0) {
+    if ( (nj%2) == 1 ) /*rdummy =  */randlc( &power, aj );
+    /*rdummy = */randlc( &aj, aj );
+    nj = nj/2;
+  }
+
+  return (power);
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void bubble(double ten[M][2], int j1[M][2], int j2[M][2], int j3[M][2], int m, int ind) {
+
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+
+  /*--------------------------------------------------------------------
+  c   bubble    does a bubble sort in direction dir
+  c-------------------------------------------------------------------*/
+  double temp;
+  int i, j_temp;
+  if ( ind == 1 ) {
+    for (i = 0; i < m-1; i++) {
+      if ( ten[i][ind] > ten[i+1][ind] ) {
+        temp = ten[i+1][ind];
+        ten[i+1][ind] = ten[i][ind];
+        ten[i][ind] = temp;
+
+        j_temp = j1[i+1][ind];
+        j1[i+1][ind] = j1[i][ind];
+        j1[i][ind] = j_temp;
+
+        j_temp = j2[i+1][ind];
+        j2[i+1][ind] = j2[i][ind];
+        j2[i][ind] = j_temp;
+
+        j_temp = j3[i+1][ind];
+        j3[i+1][ind] = j3[i][ind];
+        j3[i][ind] = j_temp;
+      } else {
+        return;
+      }
+    }
+  } else {
+    for (i = 0; i < m-1; i++) {
+      if ( ten[i][ind] < ten[i+1][ind] ) {
+
+        temp = ten[i+1][ind];
+        ten[i+1][ind] = ten[i][ind];
+        ten[i][ind] = temp;
+
+        j_temp = j1[i+1][ind];
+        j1[i+1][ind] = j1[i][ind];
+        j1[i][ind] = j_temp;
+
+        j_temp = j2[i+1][ind];
+        j2[i+1][ind] = j2[i][ind];
+        j2[i][ind] = j_temp;
+
+        j_temp = j3[i+1][ind];
+        j3[i+1][ind] = j3[i][ind];
+        j3[i][ind] = j_temp;
+      } else {
+        return;
+      }
+    }
+  }
+}
+
+/*--------------------------------------------------------------------
+c-------------------------------------------------------------------*/
+
+static void zero3(dash::NArray<double, 3> &z, int n1, int n2, int n3) {
+  /*--------------------------------------------------------------------
+  c-------------------------------------------------------------------*/
+  dash::fill(z.begin(), z.end(), 0.0);
+}
+
+/*---- end of program ------------------------------------------------*/
diff --git a/NPB/Makefile b/NPB/Makefile
new file mode 100644
index 0000000..19ee235
--- /dev/null
+++ b/NPB/Makefile
@@ -0,0 +1,71 @@
+SHELL=/bin/sh
+CLASS=W
+SFILE=config/suite.def
+
+default: header
+	@ $(SHELL) sys/print_instructions
+
+BT: bt
+bt: header
+	cd BT; $(MAKE) CLASS=$(CLASS)
+		       
+SP: sp		       
+sp: header	       
+	cd SP; $(MAKE) CLASS=$(CLASS)
+		       
+LU: lu		       
+lu: header	       
+	cd LU; $(MAKE) CLASS=$(CLASS)
+		       
+MG: mg		       
+mg: header	       
+	cd MG; $(MAKE) CLASS=$(CLASS)
+		       
+FT: ft		       
+ft: header	       
+	cd FT; $(MAKE) CLASS=$(CLASS)
+		       
+IS: is		       
+is: header	       
+	cd IS; $(MAKE) CLASS=$(CLASS)
+		       
+CG: cg		       
+cg: header	       
+	cd CG; $(MAKE) CLASS=$(CLASS)
+		       
+EP: ep		       
+ep: header	       
+	cd EP; $(MAKE) CLASS=$(CLASS)
+DC: dc
+dc: header	       
+	cd DC; $(MAKE) CLASS=$(CLASS)
+
+# Awk script courtesy cmg@cray.com
+suite:
+	@ awk '{ if ($$1 !~ /^#/ &&  NF > 0)                              \
+	printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE)  \
+	| $(SHELL)
+
+
+# It would be nice to make clean in each subdirectory (the targets
+# are defined) but on a really clean system this will won't work
+# because those makefiles need config/make.def
+clean:
+	- rm -f core 
+	- rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe
+	- rm -f sys/setparams sys/makesuite sys/setparams.hpp
+
+cleanall: clean
+	- rm -r bin/*
+
+veryclean: clean
+	- rm config/make.def config/suite.def Part*
+	- rm bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* bin/ep.* bin/cg.*
+
+header:
+	@ $(SHELL) sys/print_header
+
+kit: 
+	- makekit -s100k -k30 * */* */*/*
+
+
diff --git a/NPB/README.md b/NPB/README.md
new file mode 100644
index 0000000..9dc45f4
--- /dev/null
+++ b/NPB/README.md
@@ -0,0 +1,79 @@
+To use the C++NPB, adjust the config/make.def and rename the .cpp's. The makefile only accepts ep.cpp, mg.cpp, cg.cpp, is.cpp and ft.cpp.
+
+Below is the original makefile for Griebler et al.
+
+# How to Cite our Work
+	
+D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
+
+# The NPB-CPP Benchmark
+
+These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
+
+	==================================================================
+		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
+	 												
+			Code contributors: 
+					Dalvan Griebler    		
+					Júnior Löff
+													
+		Warning: in case of problems send an email to us:					
+			dalvan.griebler@acad.pucrs.br			
+			junior.loff@acad.pucrs.br				
+	==================================================================
+
+
+This folder contains:
+
+	- NPB-FF - Directory with the parallel version implemented in FastFlow
+	- NPB-OMP - Directory with the parallel version translated from the original NPB version
+	- NPB-SER - Directory with the serial version of the NPB ported to C++
+	- NPB-TBB - Directory with the parallel version implemented in Thread Building Blocks
+
+Each directory is independent and contains its own implemented version of the kernels:
+
+	IS - Integer Sort, random memory access
+	EP - Embarrassingly Parallel
+	CG - Conjugate Gradient, irregular memory access and communication
+	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
+	FT - discrete 3D fast Fourier Transform, all-to-all communication
+
+# Software Requiriments
+
+*Warning: our tests were made with GCC-5*
+
+**TBB**
+
+*Installation*
+
+	apt-get install libtbb-dev
+
+**FastFlow** 
+
+*Installation*
+
+	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
+
+
+# How to Compile 
+
+Enter the directory from the version desired and execute:
+
+	make _BENCHMARK CLASS=_VERSION
+
+
+_BENCHMARKs are: 
+		
+	EP, CG, MG, IS and FT 
+																										
+_VERSIONs are: 
+	
+	Class S: small for quick test purposes
+	Class W: workstation size (a 90's workstation; now likely too small)	
+	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
+	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
+
+
+Command:
+
+	make ep CLASS=B
\ No newline at end of file
diff --git a/NPB/common/c_print_results.cpp b/NPB/common/c_print_results.cpp
new file mode 100755
index 0000000..9d4d21c
--- /dev/null
+++ b/NPB/common/c_print_results.cpp
@@ -0,0 +1,70 @@
+/*****************************************************************/
+/******     C  _  P  R  I  N  T  _  R  E  S  U  L  T  S     ******/
+/*****************************************************************/
+#include <cstdlib>
+#include <cstdio>
+
+void c_print_results( char   *name, char   class_npb, int    n1, int n2, int n3, int niter, int  nthreads, double t,
+  double mops, char   *optype, int    passed_verification, char   *npbversion, char   *compiletime, char   *cc,
+  char   *clink, char   *c_lib, char   *c_inc, char   *cflags, char   *clinkflags, char   *rand)
+{
+
+    printf( "\n\n %s Benchmark Completed\n", name ); 
+
+    printf( " class_npb       =                        %c\n", class_npb );
+
+    if( n2 == 0 && n3 == 0 )
+        printf( " Size            =             %12d\n", n1 );   /* as in IS */
+    else
+        printf( " Size            =              %3dx%3dx%3d\n", n1,n2,n3 );
+
+    printf( " Iterations      =             %12d\n", niter );
+    
+    printf( " Threads         =             %12d\n", nthreads );
+ 
+    printf( " Time in seconds =             %12.2f\n", t );
+
+    printf( " Mop/s total     =             %12.2f\n", mops );
+
+    printf( " Operation type  = %24s\n", optype);
+
+    if( passed_verification )
+        printf( " Verification    =               SUCCESSFUL\n" );
+    else
+        printf( " Verification    =             UNSUCCESSFUL\n" );
+
+    printf( " Version         =             %12s\n", npbversion );
+
+    printf( " Compile date    =             %12s\n", compiletime );
+
+    printf( "\n Compile options:\n" );
+
+    printf( "    CC           = %s\n", cc );
+
+    printf( "    CLINK        = %s\n", clink );
+
+    printf( "    C_LIB        = %s\n", c_lib );
+
+    printf( "    C_INC        = %s\n", c_inc );
+
+    printf( "    CFLAGS       = %s\n", cflags );
+
+    printf( "    CLINKFLAGS   = %s\n", clinkflags );
+
+    printf( "    RAND         = %s\n", rand );
+#ifdef SMP
+    char *evalue = getenv("MP_SET_NUMTHREADS");
+    printf( "   MULTICPUS = %s\n", evalue );
+#endif
+
+/*    printf( "\n\n" );
+    printf( " Please send the results of this run to:\n\n" );
+    printf( " NPB Development Team\n" );
+    printf( " Internet: npb@nas.nasa.gov\n \n" );
+    printf( " If email is not available, send this to:\n\n" );
+    printf( " MS T27A-1\n" );
+    printf( " NASA Ames Research Center\n" );
+    printf( " Moffett Field, CA  94035-1000\n\n" );
+    printf( " Fax: 415-604-3957\n\n" );*/
+}
+ 
diff --git a/NPB/common/c_randdp.cpp b/NPB/common/c_randdp.cpp
new file mode 100755
index 0000000..521e562
--- /dev/null
+++ b/NPB/common/c_randdp.cpp
@@ -0,0 +1,137 @@
+/*
+*/
+#if defined(USE_POW)
+#define r23 pow(0.5, 23.0)
+#define r46 (r23*r23)
+#define t23 pow(2.0, 23.0)
+#define t46 (t23*t23)
+#else
+#define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5)
+#define r46 (r23*r23)
+#define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0)
+#define t46 (t23*t23)
+#endif
+
+/*c---------------------------------------------------------------------
+c---------------------------------------------------------------------*/
+
+double randlc (double *x, double a) {
+
+/*c---------------------------------------------------------------------
+c---------------------------------------------------------------------*/
+
+/*c---------------------------------------------------------------------
+c
+c   This routine returns a uniform pseudorandom double precision number in the
+c   range (0, 1) by using the linear congruential generator
+c
+c   x_{k+1} = a x_k  (mod 2^46)
+c
+c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+c   before repeating.  The argument A is the same as 'a' in the above formula,
+c   and X is the same as x_0.  A and X must be odd double precision integers
+c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
+c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+c   the new seed x_1, so that subsequent calls to RANDLC using the same
+c   arguments will generate a continuous sequence.
+c
+c   This routine should produce the same results on any computer with at least
+c   48 mantissa bits in double precision floating point data.  On 64 bit
+c   systems, double precision should be disabled.
+c
+c   David H. Bailey     October 26, 1990
+c
+c---------------------------------------------------------------------*/
+
+    double t1,t2,t3,t4,a1,a2,x1,x2,z;
+
+/*c---------------------------------------------------------------------
+c   Break A into two parts such that A = 2^23 * A1 + A2.
+c---------------------------------------------------------------------*/
+    t1 = r23 * a;
+    a1 = (int)t1;
+    a2 = a - t23 * a1;
+
+/*c---------------------------------------------------------------------
+c   Break X into two parts such that X = 2^23 * X1 + X2, compute
+c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+c   X = 2^23 * Z + A2 * X2  (mod 2^46).
+c---------------------------------------------------------------------*/
+    t1 = r23 * (*x);
+    x1 = (int)t1;
+    x2 = (*x) - t23 * x1;
+    t1 = a1 * x2 + a2 * x1;
+    t2 = (int)(r23 * t1);
+    z = t1 - t23 * t2;
+    t3 = t23 * z + a2 * x2;
+    t4 = (int)(r46 * t3);
+    (*x) = t3 - t46 * t4;
+
+    return (r46 * (*x));
+}
+
+/*c---------------------------------------------------------------------
+c---------------------------------------------------------------------*/
+
+void vranlc (int n, double *x_seed, double a, double y[]) {
+
+/*c---------------------------------------------------------------------
+c---------------------------------------------------------------------*/
+
+/*c---------------------------------------------------------------------
+c
+c   This routine generates N uniform pseudorandom double precision numbers in
+c   the range (0, 1) by using the linear congruential generator
+c
+c   x_{k+1} = a x_k  (mod 2^46)
+c
+c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+c   before repeating.  The argument A is the same as 'a' in the above formula,
+c   and X is the same as x_0.  A and X must be odd double precision integers
+c   in the range (1, 2^46).  The N results are placed in Y and are normalized
+c   to be between 0 and 1.  X is updated to contain the new seed, so that
+c   subsequent calls to VRANLC using the same arguments will generate a
+c   continuous sequence.  If N is zero, only initialization is performed, and
+c   the variables X, A and Y are ignored.
+c
+c   This routine is the standard version designed for scalar or RISC systems.
+c   However, it should produce the same results on any single processor
+c   computer with at least 48 mantissa bits in double precision floating point
+c   data.  On 64 bit systems, double precision should be disabled.
+c
+c---------------------------------------------------------------------*/
+
+    int i;
+    double x,t1,t2,t3,t4,a1,a2,x1,x2,z;
+
+/*c---------------------------------------------------------------------
+c   Break A into two parts such that A = 2^23 * A1 + A2.
+c---------------------------------------------------------------------*/
+    t1 = r23 * a;
+    a1 = (int)t1;
+    a2 = a - t23 * a1;
+    x = *x_seed;
+
+/*c---------------------------------------------------------------------
+c   Generate N results.   This loop is not vectorizable.
+c---------------------------------------------------------------------*/
+    for (i = 1; i <= n; i++) {
+
+/*c---------------------------------------------------------------------
+c   Break X into two parts such that X = 2^23 * X1 + X2, compute
+c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+c   X = 2^23 * Z + A2 * X2  (mod 2^46).
+c---------------------------------------------------------------------*/
+        t1 = r23 * x;
+        x1 = (int)t1;
+        x2 = x - t23 * x1;
+        t1 = a1 * x2 + a2 * x1;
+        t2 = (int)(r23 * t1);
+        z = t1 - t23 * t2;
+        t3 = t23 * z + a2 * x2;
+        t4 = (int)(r46 * t3);
+        x = t3 - t46 * t4;
+        y[i] = r46 * x;
+    }
+    *x_seed = x;
+}
\ No newline at end of file
diff --git a/NPB/common/c_timers.cpp b/NPB/common/c_timers.cpp
new file mode 100755
index 0000000..f38f92c
--- /dev/null
+++ b/NPB/common/c_timers.cpp
@@ -0,0 +1,65 @@
+
+
+
+#include "wtime.hpp"
+#include <cstdlib>
+
+/*  Prototype  */
+void wtime( double * );
+
+
+
+/*****************************************************************/
+/******         E  L  A  P  S  E  D  _  T  I  M  E          ******/
+/*****************************************************************/
+double elapsed_time( void )
+{
+    double t;
+
+    wtime( &t );
+    return( t );
+}
+
+
+double start[64], elapsed[64];
+
+/*****************************************************************/
+/******            T  I  M  E  R  _  C  L  E  A  R          ******/
+/*****************************************************************/
+void timer_clear( int n )
+{
+    elapsed[n] = 0.0;
+}
+
+
+/*****************************************************************/
+/******            T  I  M  E  R  _  S  T  A  R  T          ******/
+/*****************************************************************/
+void timer_start( int n )
+{
+    start[n] = elapsed_time();
+}
+
+
+/*****************************************************************/
+/******            T  I  M  E  R  _  S  T  O  P             ******/
+/*****************************************************************/
+void timer_stop( int n )
+{
+    double t, now;
+
+    now = elapsed_time();
+    t = now - start[n];
+    elapsed[n] += t;
+
+}
+
+
+/*****************************************************************/
+/******            T  I  M  E  R  _  R  E  A  D             ******/
+/*****************************************************************/
+double timer_read( int n )
+{
+    return( elapsed[n] );
+}
+
diff --git a/NPB/common/npb-CPP.hpp b/NPB/common/npb-CPP.hpp
new file mode 100755
index 0000000..07181e9
--- /dev/null
+++ b/NPB/common/npb-CPP.hpp
@@ -0,0 +1,52 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif /* _OPENMP */
+
+typedef int boolean;
+typedef struct { double real; double imag; } dcomplex;
+
+#define TRUE	1
+#define FALSE	0
+
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#define	pow2(a) ((a)*(a))
+
+#define get_real(c) c.real
+#define get_imag(c) c.imag
+#define cadd(c,a,b) (c.real = a.real + b.real, c.imag = a.imag + b.imag)
+#define csub(c,a,b) (c.real = a.real - b.real, c.imag = a.imag - b.imag)
+#define cmul(c,a,b) (c.real = a.real * b.real - a.imag * b.imag, \
+                    c.imag = a.real * b.imag + a.imag * b.real)
+#define crmul(c,a,b) (c.real = a.real * b, c.imag = a.imag * b)
+
+dcomplex cmulret(dcomplex a, dcomplex b) {
+  dcomplex res;
+  res.real = a.real * b.real - a.imag * b.imag;
+  res.imag = a.real * b.imag + a.imag * b.real;
+  return res;
+}
+
+dcomplex crmulret(dcomplex a, double b) {
+  dcomplex res;
+  res.real = a.real * b;
+  res.imag = a.imag * b;
+  return res;
+}
+
+extern double randlc(double *, double);
+extern void vranlc(int, double *, double, double *);
+extern void timer_clear(int);
+extern void timer_start(int);
+extern void timer_stop(int);
+extern double timer_read(int);
+
+extern void c_print_results(char *name, char class_npb, int n1, int n2,
+			    int n3, int niter, int nthreads, double t,
+			    double mops, char *optype, int passed_verification,
+			    char *npbversion, char *compiletime, char *cc,
+			    char *clink, char *c_lib, char *c_inc,
+			    char *cflags, char *clinkflags, char *rand);
diff --git a/NPB/common/util.h b/NPB/common/util.h
new file mode 100755
index 0000000..b47c1b4
--- /dev/null
+++ b/NPB/common/util.h
@@ -0,0 +1,16 @@
+
+#ifndef UTIL_H_INCLUDED
+#define UTIL_H_INCLUDED
+
+#include <time.h>
+
+#define TIMESTAMP(time_) 						\
+  do {									\
+      struct timespec ts;						\
+      clock_gettime(CLOCK_MONOTONIC, &ts);				\
+      time_=((double)ts.tv_sec)+(1.0e-9)*((double)ts.tv_nsec);		\
+  } while(0)
+
+
+#endif /* UTIL_H_INCLUDED */
+
diff --git a/NPB/common/wtime.cpp b/NPB/common/wtime.cpp
new file mode 100755
index 0000000..33f2fe6
--- /dev/null
+++ b/NPB/common/wtime.cpp
@@ -0,0 +1,13 @@
+#include "wtime.hpp"
+#include <sys/time.h>
+
+void wtime(double *t)
+{
+  static int sec = -1;
+  struct timeval tv;
+  gettimeofday(&tv, 0);
+  if (sec < 0) sec = tv.tv_sec;
+  *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;
+}
+
+    
diff --git a/NPB/common/wtime.hpp b/NPB/common/wtime.hpp
new file mode 100755
index 0000000..12eb0cb
--- /dev/null
+++ b/NPB/common/wtime.hpp
@@ -0,0 +1,12 @@
+/* C/Fortran interface is different on different machines. 
+ * You may need to tweak this.
+ */
+
+
+#if defined(IBM)
+#define wtime wtime
+#elif defined(CRAY)
+#define wtime WTIME
+#else
+#define wtime wtime_
+#endif
diff --git a/NPB/common/wtime_sgi64.cpp b/NPB/common/wtime_sgi64.cpp
new file mode 100755
index 0000000..6c810d2
--- /dev/null
+++ b/NPB/common/wtime_sgi64.cpp
@@ -0,0 +1,74 @@
+#include <sys/types.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/syssgi.h>
+#include <sys/immu.h>
+#include <cerrno>
+#include <cstdio>
+
+/* The following works on SGI Power Challenge systems */
+
+typedef unsigned long iotimer_t;
+
+unsigned int cycleval;
+volatile iotimer_t *iotimer_addr, base_counter;
+double resolution;
+
+/* address_t is an integer type big enough to hold an address */
+typedef unsigned long address_t;
+
+
+
+void timer_init() 
+{
+  
+  int fd;
+  char *virt_addr;
+  address_t phys_addr, page_offset, pagemask, pagebase_addr;
+  
+  pagemask = getpagesize() - 1;
+  errno = 0;
+  phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval);
+  if (errno != 0) {
+    perror("SGI_QUERY_CYCLECNTR");
+    exit(1);
+  }
+  /* rel_addr = page offset of physical address */
+  page_offset = phys_addr & pagemask;
+  pagebase_addr = phys_addr - page_offset;
+  fd = open("/dev/mmem", O_RDONLY);
+
+  virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr);
+  virt_addr = virt_addr + page_offset;
+  iotimer_addr = (iotimer_t *)virt_addr;
+  /* cycleval in picoseconds to this gives resolution in seconds */
+  resolution = 1.0e-12*cycleval; 
+  base_counter = *iotimer_addr;
+}
+
+void wtime_(double *time) 
+{
+  static int initialized = 0;
+  volatile iotimer_t counter_value;
+  if (!initialized) { 
+    timer_init();
+    initialized = 1;
+  }
+  counter_value = *iotimer_addr - base_counter;
+  *time = (double)counter_value * resolution;
+}
+
+
+void wtime(double *time) 
+{
+  static int initialized = 0;
+  volatile iotimer_t counter_value;
+  if (!initialized) { 
+    timer_init();
+    initialized = 1;
+  }
+  counter_value = *iotimer_addr - base_counter;
+  *time = (double)counter_value * resolution;
+}
+
+
diff --git a/NPB/config/make.def b/NPB/config/make.def
new file mode 100755
index 0000000..63cd602
--- /dev/null
+++ b/NPB/config/make.def
@@ -0,0 +1,101 @@
+#---------------------------------------------------------------------------
+#
+#                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS.
+#
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Items in this file will need to be changed for each platform.
+# (Note these definitions are inconsistent with NPB2.1.)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Parallel C:
+#
+# CC         - C compiler
+# CFLAGS     - C compilation arguments
+# C_INC      - any -I arguments required for compiling C
+# CLINK      - C linker
+# CLINKFLAGS - C linker flags
+# C_LIB      - any -L and -l arguments required for linking C
+#
+# compilations are done with $(CC) $(C_INC) $(CFLAGS) or
+#                            $(CC) $(CFLAGS)
+# linking is done with       $(CLINK) $(C_LIB) $(CLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the C compiler used for DASH programs
+#---------------------------------------------------------------------------
+CC = dash-mpicxx
+#gcc #cc
+# This links C programs; usually the same as ${CC}
+CLINK	= $(CC)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker
+#---------------------------------------------------------------------------
+C_LIB = -L$(HOME)/opt/dash-0.4.0/lib
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler
+#---------------------------------------------------------------------------
+C_INC = -I../common -I$(HOME)/opt/dash-0.4.0/include
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for C programs (use -w3 -diag-disable 981 -diag-disable 383)
+#---------------------------------------------------------------------------
+CFLAGS	= -std=c++14 -O3
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable
+# size usually go here. (use -w3 -diag-disable 981 -diag-disable 383)
+#---------------------------------------------------------------------------
+CLINKFLAGS = -lm -ldash-mpi -ldart-mpi -ldart-base -lhwloc -lnuma -lpthread
+
+
+#---------------------------------------------------------------------------
+# Utilities C:
+#
+# This is the C compiler used to compile C utilities.  Flags required by
+# this compiler go here also; typically there are few flags required; hence
+# there are no separate macros provided for such flags.
+#---------------------------------------------------------------------------
+UCC		= gcc
+#UCC	= cc
+
+
+#---------------------------------------------------------------------------
+# Destination of executables, relative to subdirs of the main directory. .
+#---------------------------------------------------------------------------
+BINDIR	= ../bin
+
+
+#---------------------------------------------------------------------------
+# The variable RAND controls which random number generator
+# is used. It is described in detail in Doc/README.install.
+# Use "randi8" unless there is a reason to use another one.
+# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
+#---------------------------------------------------------------------------
+# RAND   = randi8
+# The following is highly reliable but may be slow:
+RAND   = randdp
+
+
+#---------------------------------------------------------------------------
+# The variable WTIME is the name of the wtime source code module in the
+# NPB2.x/common directory.
+# For most machines,       use wtime.c
+# For SGI power challenge: use wtime_sgi64.c
+#---------------------------------------------------------------------------
+WTIME  = wtime.cpp
+
+
+#---------------------------------------------------------------------------
+# Enable if either Cray or IBM:
+# (no such flag for most machines: see common/wtime.h)
+# This is used by the C compiler to pass the machine name to common/wtime.h,
+# where the C/Fortran binding interface format is determined
+#---------------------------------------------------------------------------
+# MACHINE	=	-DCRAY
+# MACHINE	=	-DIBM
diff --git a/NPB/config/suite.def b/NPB/config/suite.def
new file mode 100755
index 0000000..6b9be9d
--- /dev/null
+++ b/NPB/config/suite.def
@@ -0,0 +1,14 @@
+# config/suite.def
+# This file is used to build several benchmarks with a single command.
+# Typing "make suite" in the main directory will build all the benchmarks
+# specified in this file.
+# Each line of this file contains a benchmark name, class, and number
+# of nodes. The name is one of "cg", "is", "ep", mg", "ft"
+# The class is one of "S", "W", "A", "B", and "C".
+# No blank lines.
+# The following example builds serial sample sizes of all benchmarks.
+ft	B
+mg	B
+is	B
+ep	B
+cg	B
diff --git a/NPB/sys/Makefile b/NPB/sys/Makefile
new file mode 100644
index 0000000..829fc85
--- /dev/null
+++ b/NPB/sys/Makefile
@@ -0,0 +1,21 @@
+include ../config/make.def
+
+# Note that COMPILE is also defined in make.common and should
+# be the same. We can't include make.common because it has a lot
+# of other garbage. 
+FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
+
+all: setparams 
+
+# setparams creates an npbparam.h file for each benchmark 
+# configuration. npbparams.h also contains info about how a benchmark
+# was compiled and linked
+
+setparams: setparams.cpp ../config/make.def
+	$(UCC) -o setparams setparams.cpp
+
+
+clean: 
+	-rm -f setparams setparams.hpp npbparams.hpp
+	-rm -f *~ *.o
+
diff --git a/NPB/sys/README b/NPB/sys/README
new file mode 100644
index 0000000..ede69b5
--- /dev/null
+++ b/NPB/sys/README
@@ -0,0 +1,41 @@
+This directory contains utilities and files used by the 
+build process. You should not need to change anything
+in this directory. 
+
+Original Files
+--------------
+setparams.c:
+        Source for the setparams program. This program is used internally
+        in the build process to create the file "npbparams.h" for each 
+        benchmark. npbparams.h contains Fortran or C parameters to build a 
+        benchmark for a specific class. The setparams program is never run 
+        directly by a user. Its invocation syntax is 
+
+            "setparams benchmark-name class". 
+
+        It examines the file "npbparams.h" in the current directory. If 
+        the specified parameters are the same as those in the npbparams.h 
+        file, nothing it changed. If the file does not exist or corresponds 
+        to a different class/number of nodes, it is (re)built. 
+	One of the more complicated things in npbparams.h is that it 
+        contains, in a Fortran string, the compiler flags used to build a 
+        benchmark, so that a benchmark can print out how it was compiled. 
+
+make.common
+        A makefile segment that is included in each individual benchmark
+        program makefile. It sets up some standard macros (COMPILE, etc) 
+        and makes sure everything is configured correctly (npbparams.h)
+
+Makefile
+        Builds  setparams
+
+README
+        This file. 
+
+
+Created files
+-------------
+
+setparams
+	See descriptions above
+
diff --git a/NPB/sys/make.common b/NPB/sys/make.common
new file mode 100644
index 0000000..abb3a28
--- /dev/null
+++ b/NPB/sys/make.common
@@ -0,0 +1,65 @@
+PROGRAM  = $(BINDIR)/$(BENCHMARK).$(CLASS)
+FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
+CCOMPILE = $(CC)  -c $(C_INC) $(CFLAGS)
+
+# Class "U" is used internally by the setparams program to mean
+# "unknown". This means that if you don't specify CLASS=
+# on the command line, you'll get an error. It would be nice
+# to be able to avoid this, but we'd have to get information
+# from the setparams back to the make program, which isn't easy. 
+CLASS=U
+
+default:: ${PROGRAM}
+
+# This makes sure the configuration utility setparams 
+# is up to date. 
+# Note that this must be run every time, which is why the
+# target does not exist and is not created. 
+# If you create a file called "config" you will break things. 
+config:
+	@cd ../sys; ${MAKE} all
+	../sys/setparams ${BENCHMARK} ${CLASS}
+
+COMMON=../common
+${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f
+	cd ${COMMON}; ${FCOMPILE} ${RAND}.f
+
+${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp
+	cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp
+
+${COMMON}/print_results.o: ${COMMON}/print_results.f
+	cd ${COMMON}; ${FCOMPILE} print_results.f
+
+${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp
+	cd ${COMMON}; ${CCOMPILE} c_print_results.cpp
+
+${COMMON}/timers.o: ${COMMON}/timers.f
+	cd ${COMMON}; ${FCOMPILE} timers.f
+
+${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp
+	cd ${COMMON}; ${CCOMPILE} c_timers.cpp
+
+${COMMON}/wtime.o: ${COMMON}/${WTIME}
+	cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME}
+# For most machines or CRAY or IBM
+#	cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c
+# For a precise timer on an SGI Power Challenge, try:
+#	cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c
+
+${COMMON}/c_wtime.o: ${COMMON}/${WTIME}
+	cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME}
+
+
+# Normally setparams updates npbparams.h only if the settings (CLASS)
+# have changed. However, we also want to update if the compile options
+# may have changed (set in ../config/make.def). 
+npbparams.hpp: ../config/make.def
+	@ echo make.def modified. Rebuilding npbparams.hpp just in case
+	rm -f npbparams.hpp
+	../sys/setparams ${BENCHMARK} ${CLASS}
+
+# So that "make benchmark-name" works
+${BENCHMARK}:  default
+${BENCHMARKU}: default
+
+
diff --git a/NPB/sys/print_header b/NPB/sys/print_header
new file mode 100644
index 0000000..7765ebd
--- /dev/null
+++ b/NPB/sys/print_header
@@ -0,0 +1,15 @@
+echo ''
+echo '   ========================================='
+echo '   =      NAS Parallel Benchmarks          ='
+echo '   =      C++ DASH Versions                ='
+echo '   =      Developed by:    Dalvan Griebler ='
+echo '   =                       Júnior Löff     ='
+echo '   =      DASH version by: Nicco Mietzsch  ='
+echo '   =                                       ='
+echo '   =      Warning: in case of problems     ='
+echo '   =      send an email to us:             ='
+echo '   =      dalvan.griebler@acad.pucrs.br    ='
+echo '   =      junior.loff@acad.pucrs.br        ='
+echo '   =      nicco.mietzsch@campus.lmu.de     ='
+echo '   ========================================='
+echo ''
diff --git a/NPB/sys/print_instructions b/NPB/sys/print_instructions
new file mode 100644
index 0000000..bc6221f
--- /dev/null
+++ b/NPB/sys/print_instructions
@@ -0,0 +1,18 @@
+echo ''
+echo '   To make a NAS benchmark type '
+echo ''
+echo '         make <benchmark-name> CLASS=<class>'
+echo ''
+echo '   where <benchmark-name> is "cg", "ep", "ft", "is", or "mg"'
+echo '         <class>          is "S", "W", "A", "B" or "C"'
+echo ''
+echo '   To make a set of benchmarks, create the file config/suite.def'
+echo '   according to the instructions in config/suite.def.template and type'
+echo ''
+echo '         make suite'
+echo ''
+echo ' ***************************************************************'
+echo ' * Remember to edit the file config/make.def for site specific *'
+echo ' * information as described in the README file                 *'
+echo ' ***************************************************************'
+
diff --git a/NPB/sys/setparams.cpp b/NPB/sys/setparams.cpp
new file mode 100644
index 0000000..107ca57
--- /dev/null
+++ b/NPB/sys/setparams.cpp
@@ -0,0 +1,896 @@
+/*
+ * This utility configures a NPB to be built for a specific class.
+ * It creates a file "npbparams.h"
+ * in the source directory. This file keeps state information about
+ * which size of benchmark is currently being built (so that nothing
+ * if unnecessarily rebuilt) and defines (through PARAMETER statements)
+ * the number of nodes and class for which a benchmark is being built.
+
+ * The utility takes 3 arguments:
+ *       setparams benchmark-name class
+ *    benchmark-name is "sp", "bt", etc
+ *    class is the size of the benchmark
+ * These parameters are checked for the current benchmark. If they
+ * are invalid, this program prints a message and aborts.
+ * If the parameters are ok, the current npbsize.h (actually just
+ * the first line) is read in. If the new parameters are the same as
+ * the old, nothing is done, but an exit code is returned to force the
+ * user to specify (otherwise the make procedure succeeds but builds a
+ * binary of the wrong name).  Otherwise the file is rewritten.
+ * Errors write a message (to stdout) and abort.
+ *
+ * This program makes use of two extra benchmark "classes"
+ * class "X" means an invalid specification. It is returned if
+ * there is an error parsing the config file.
+ * class "U" is an external specification meaning "unknown class"
+ *
+ * Unfortunately everything has to be case sensitive. This is
+ * because we can always convert lower to upper or v.v. but
+ * can't feed this information back to the makefile, so typing
+ * make CLASS=a and make CLASS=A will produce different binaries.
+ *
+ *
+ */
+
+#include <sys/types.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cctype>
+#include <cstring>
+#include <ctime>
+
+/*
+ * This is the master version number for this set of
+ * NPB benchmarks. It is in an obscure place so people
+ * won't accidentally change it.
+ */
+
+#define VERSION "4.0"
+
+/* controls verbose output from setparams */
+/* #define VERBOSE */
+
+#define FILENAME "npbparams.hpp"
+#define DESC_LINE "/* CLASS = %c */\n"
+#define DEF_CLASS_LINE     "#define CLASS '%c'\n"
+#define FINDENT  "        "
+#define CONTINUE "     > "
+
+void get_info(char *argv[], int *typep, char *classp);
+void check_info(int type, char class_npb);
+void read_info(int type, char *classp);
+void write_info(int type, char class_npb);
+void write_sp_info(FILE *fp, char class_npb);
+void write_bt_info(FILE *fp, char class_npb);
+void write_dc_info(FILE *fp, char class_npb);
+void write_lu_info(FILE *fp, char class_npb);
+void write_mg_info(FILE *fp, char class_npb);
+void write_cg_info(FILE *fp, char class_npb);
+void write_ft_info(FILE *fp, char class_npb);
+void write_ep_info(FILE *fp, char class_npb);
+void write_is_info(FILE *fp, char class_npb);
+void write_compiler_info(int type, FILE *fp);
+void write_convertdouble_info(int type, FILE *fp);
+void check_line(char *line, char *label, char *val);
+int  check_include_line(char *line, char *filename);
+void put_string(FILE *fp, char *name, char *val);
+void put_def_string(FILE *fp, char *name, char *val);
+void put_def_variable(FILE *fp, char *name, char *val);
+int ilog2(int i);
+
+enum benchmark_types {SP, BT, LU, MG, FT, IS, EP, CG, DC};
+
+main(int argc, char *argv[])
+{
+  int type;
+  char class_npb, class_old;
+
+  if (argc != 3) {
+    printf("Usage: %s benchmark-name class_npb\n", argv[0]);
+    exit(1);
+  }
+
+  /* Get command line arguments. Make sure they're ok. */
+  get_info(argv, &type, &class_npb);
+  if (class_npb != 'U') {
+#ifdef VERBOSE
+    printf("setparams: For benchmark %s: class_npb = %c\n",
+	   argv[1], class_npb);
+#endif
+    check_info(type, class_npb);
+  }
+
+  /* Get old information. */
+  read_info(type, &class_old);
+  if (class_npb != 'U') {
+    if (class_old != 'X') {
+#ifdef VERBOSE
+      printf("setparams:     old settings: class_npb = %c\n",
+	     class_old);
+#endif
+    }
+  } else {
+    printf("setparams:\n\
+  *********************************************************************\n\
+  * You must specify CLASS to build this benchmark                    *\n\
+  * For example, to build a class A benchmark, type                   *\n\
+  *       make {benchmark-name} CLASS=A                               *\n\
+  *********************************************************************\n\n");
+
+    if (class_old != 'X') {
+#ifdef VERBOSE
+      printf("setparams: Previous settings were CLASS=%c \n", class_old);
+#endif
+    }
+    exit(1); /* exit on class==U */
+  }
+
+  /* Write out new information if it's different. */
+  if (class_npb != class_old) {
+#ifdef VERBOSE
+    printf("setparams: Writing %s\n", FILENAME);
+#endif
+    write_info(type, class_npb);
+  } else {
+#ifdef VERBOSE
+    printf("setparams: Settings unchanged. %s unmodified\n", FILENAME);
+#endif
+  }
+
+  exit(0);
+}
+
+
+/*
+ *  get_info(): Get parameters from command line
+ */
+
+void get_info(char *argv[], int *typep, char *classp)
+{
+
+  *classp = *argv[2];
+
+  if      (!strcmp(argv[1], "sp") || !strcmp(argv[1], "SP")) *typep = SP;
+  else if (!strcmp(argv[1], "bt") || !strcmp(argv[1], "BT")) *typep = BT;
+  else if (!strcmp(argv[1], "ft") || !strcmp(argv[1], "FT")) *typep = FT;
+  else if (!strcmp(argv[1], "lu") || !strcmp(argv[1], "LU")) *typep = LU;
+  else if (!strcmp(argv[1], "mg") || !strcmp(argv[1], "MG")) *typep = MG;
+  else if (!strcmp(argv[1], "is") || !strcmp(argv[1], "IS")) *typep = IS;
+  else if (!strcmp(argv[1], "ep") || !strcmp(argv[1], "EP")) *typep = EP;
+  else if (!strcmp(argv[1], "cg") || !strcmp(argv[1], "CG")) *typep = CG;
+  else if (!strcmp(argv[1], "dc") || !strcmp(argv[1], "DC")) *typep = DC;
+  else {
+    printf("setparams: Error: unknown benchmark type %s\n", argv[1]);
+    exit(1);
+  }
+}
+
+/*
+ *  check_info(): Make sure command line data is ok for this benchmark
+ */
+
+void check_info(int type, char class_npb)
+{
+  int tmplog;
+
+  /* check class_npb */
+  if (class_npb != 'S' &&
+      class_npb != 'A' &&
+      class_npb != 'B' &&
+      class_npb != 'R' &&
+      class_npb != 'W' &&
+      class_npb != 'C') {
+    printf("setparams: Unknown benchmark class_npb %c\n", class_npb);
+    printf("setparams: Allowed classes are \"S\", \"A\", \"B\" and \"C\"\n");
+    exit(1);
+  }
+
+}
+
+
+/*
+ * read_info(): Read previous information from file.
+ *              Not an error if file doesn't exist, because this
+ *              may be the first time we're running.
+ *              Assumes the first line of the file is in a special
+ *              format that we understand (since we wrote it).
+ */
+
+void read_info(int type, char *classp)
+{
+  int nread, gotem = 0;
+  char line[200];
+  FILE *fp;
+  fp = fopen(FILENAME, "r");
+  if (fp == NULL) {
+#ifdef VERBOSE
+    printf("setparams: INFO: configuration file %s does not exist (yet)\n", FILENAME);
+#endif
+    goto abort;
+  }
+
+  /* first line of file contains info (fortran), first two lines (C) */
+
+  switch(type) {
+      case SP:
+      case BT:
+      case FT:
+      case MG:
+      case LU:
+      case EP:
+      case CG:
+          nread = fscanf(fp, DESC_LINE, classp);
+          if (nread != 1) {
+            printf("setparams: Error parsing config file %s. Ignoring previous settings\n", FILENAME);
+            goto abort;
+          }
+          break;
+      case IS:
+      case DC:
+          nread = fscanf(fp, DEF_CLASS_LINE, classp);
+          if (nread != 1) {
+            printf("setparams: Error parsing config file %s. Ignoring previous settings\n", FILENAME);
+            goto abort;
+          }
+          break;
+      default:
+        /* never should have gotten this far with a bad name */
+        printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type);
+        exit(1);
+  }
+
+ normal_return:
+  *classp = *classp;
+  fclose(fp);
+
+
+  return;
+
+ abort:
+  *classp = 'X';
+  return;
+}
+
+
+/*
+ * write_info(): Write new information to config file.
+ *               First line is in a special format so we can read
+ *               it in again. Then comes a warning. The rest is all
+ *               specific to a particular benchmark.
+ */
+
+void write_info(int type, char class_npb)
+{
+  FILE *fp;
+  fp = fopen(FILENAME, "w");
+  if (fp == NULL) {
+    printf("setparams: Can't open file %s for writing\n", FILENAME);
+    exit(1);
+  }
+
+  switch(type) {
+      case SP:
+      case BT:
+      case FT:
+      case MG:
+      case LU:
+      case EP:
+      case CG:
+          /* Write out the header */
+          fprintf(fp, DESC_LINE, class_npb);
+          /* Print out a warning so bozos don't mess with the file */
+          fprintf(fp, "\
+/*\n\
+c  This file is generated automatically by the setparams utility.\n\
+c  It sets the number of processors and the class_npb of the NPB\n\
+c  in this directory. Do not modify it by hand.\n\
+*/\n");
+
+          break;
+      case IS:
+          fprintf(fp, DEF_CLASS_LINE, class_npb);
+          fprintf(fp, "\
+/*\n\
+   This file is generated automatically by the setparams utility.\n\
+   It sets the number of processors and the class of the NPB\n\
+   in this directory. Do not modify it by hand.   */\n\
+   \n");
+          break;
+       case DC:
+          fprintf(fp, DEF_CLASS_LINE, class_npb);
+          fprintf(fp, "\
+/*\n\
+   This file is generated automatically by the setparams utility.\n\
+   It sets the number of processors and the class of the NPB\n\
+   in this directory. Do not modify it by hand.\n\
+   This file provided for backward compatibility.\n\
+   It is not used in DC benchmark.   */\n\
+   \n");
+          break;
+      default:
+          printf("setparams: (Internal error): Unknown benchmark type %d\n",
+                                                                         type);
+          exit(1);
+  }
+
+  /* Now do benchmark-specific stuff */
+  switch(type) {
+  case SP:
+    write_sp_info(fp, class_npb);
+    break;
+  case BT:
+    write_bt_info(fp, class_npb);
+    break;
+  case DC:
+    write_dc_info(fp, class_npb);
+    break;
+  case LU:
+    write_lu_info(fp, class_npb);
+    break;
+  case MG:
+    write_mg_info(fp, class_npb);
+    break;
+  case IS:
+    write_is_info(fp, class_npb);
+    break;
+  case FT:
+    write_ft_info(fp, class_npb);
+    break;
+  case EP:
+    write_ep_info(fp, class_npb);
+    break;
+  case CG:
+    write_cg_info(fp, class_npb);
+    break;
+  default:
+    printf("setparams: (Internal error): Unknown benchmark type %d\n", type);
+    exit(1);
+  }
+  write_convertdouble_info(type, fp);
+  write_compiler_info(type, fp);
+  fclose(fp);
+  return;
+}
+
+
+/*
+ * write_sp_info(): Write SP specific info to config file
+ */
+
+void write_sp_info(FILE *fp, char class_npb)
+{
+  int problem_size, niter;
+  const char *dt;
+  if      (class_npb == 'S') { problem_size = 12;  dt = "0.015";   niter = 100; }
+  else if (class_npb == 'W') { problem_size = 36;  dt = "0.0015";  niter = 400; }
+  else if (class_npb == 'A') { problem_size = 64;  dt = "0.0015";  niter = 400; }
+  else if (class_npb == 'B') { problem_size = 102; dt = "0.001";   niter = 400; }
+  else if (class_npb == 'C') { problem_size = 162; dt = "0.00067"; niter = 400; }
+  else {
+    printf("setparams: Internal error: invalid class_npb %c\n", class_npb);
+    exit(1);
+  }
+  fprintf(fp, "#define\tPROBLEM_SIZE\t%d\n", problem_size);
+  fprintf(fp, "#define\tNITER_DEFAULT\t%d\n", niter);
+  fprintf(fp, "#define\tDT_DEFAULT\t%s\n", dt);
+}
+
+/*
+ * write_bt_info(): Write BT specific info to config file
+ */
+
+void write_bt_info(FILE *fp, char class_npb)
+{
+  int problem_size, niter;
+  const char *dt;
+  if      (class_npb == 'S') { problem_size = 12;  dt = "0.010";   niter = 60; }
+  else if (class_npb == 'W') { problem_size = 24;  dt = "0.0008";  niter = 200; }
+  else if (class_npb == 'A') { problem_size = 64;  dt = "0.0008";  niter = 200; }
+  else if (class_npb == 'B') { problem_size = 102; dt = "0.0003";  niter = 200; }
+  else if (class_npb == 'C') { problem_size = 162; dt = "0.0001";  niter = 200; }
+  else {
+    printf("setparams: Internal error: invalid class_npb %c\n", class_npb);
+    exit(1);
+  }
+  fprintf(fp, "#define\tPROBLEM_SIZE\t%d\n", problem_size);
+  fprintf(fp, "#define\tNITER_DEFAULT\t%d\n", niter);
+  fprintf(fp, "#define\tDT_DEFAULT\t%s\n", dt);
+}
+
+/*
+ * write_dc_info(): Write DC specific info to config file
+ */
+
+
+void write_dc_info(FILE *fp, char class_npb)
+{
+  long int input_tuples, attrnum;
+  if      (class_npb == 'S') { input_tuples = 1000;     attrnum = 5; }
+  else if (class_npb == 'W') { input_tuples = 100000;   attrnum = 10; }
+  else if (class_npb == 'A') { input_tuples = 1000000;  attrnum = 15; }
+  else if (class_npb == 'B') { input_tuples = 10000000; attrnum = 20; }
+  else {
+    printf("setparams: Internal error: invalid class_npb %c\n", class_npb);
+    exit(1);
+  }
+  fprintf(fp, "long long int input_tuples=%ld, attrnum=%ld;\n",
+              input_tuples, attrnum);
+}
+
+/*
+ * write_lu_info(): Write SP specific info to config file
+ */
+
+void write_lu_info(FILE *fp, char class_npb)
+{
+  int isiz1, isiz2, itmax, inorm, problem_size;
+  int xdiv, ydiv; /* number of cells in x and y direction */
+  const char *dt_default;
+
+  if      (class_npb == 'S') { problem_size = 12;  dt_default = "0.5"; itmax = 50; }
+  else if (class_npb == 'W') { problem_size = 33;  dt_default = "1.5e-3"; itmax = 300; }
+  else if (class_npb == 'A') { problem_size = 64;  dt_default = "2.0"; itmax = 250; }
+  else if (class_npb == 'B') { problem_size = 102; dt_default = "2.0"; itmax = 250; }
+  else if (class_npb == 'C') { problem_size = 162; dt_default = "2.0"; itmax = 250; }
+  else {
+    printf("setparams: Internal error: invalid class_npb %c\n", class_npb);
+    exit(1);
+  }
+  inorm = itmax;
+  isiz1 = problem_size;
+  isiz2 = problem_size;
+
+  fprintf(fp, "\n/* full problem size */\n");
+  fprintf(fp, "#define\tISIZ1\t%d\n", problem_size);
+  fprintf(fp, "#define\tISIZ2\t%d\n", problem_size);
+  fprintf(fp, "#define\tISIZ3\t%d\n", problem_size);
+
+  fprintf(fp, "/* number of iterations and how often to print the norm */\n");
+  fprintf(fp, "#define\tITMAX_DEFAULT\t%d\n", itmax);
+  fprintf(fp, "#define\tINORM_DEFAULT\t%d\n", inorm);
+
+  fprintf(fp, "#define\tDT_DEFAULT\t%s\n", dt_default);
+}
+
+/*
+ * write_mg_info(): Write MG specific info to config file
+ */
+
+void write_mg_info(FILE *fp, char class_npb)
+{
+  int problem_size, nit, log2_size, lt_default, lm;
+  int ndim1, ndim2, ndim3;
+  if      (class_npb == 'S') { problem_size = 32; nit = 4; }
+  else if (class_npb == 'W') { problem_size = 64; nit = 40; }
+  else if (class_npb == 'A') { problem_size = 256; nit = 4; }
+  else if (class_npb == 'B') { problem_size = 256; nit = 20; }
+  else if (class_npb == 'C') { problem_size = 512; nit = 20; }
+  else {
+    printf("setparams: Internal error: invalid class_npb type %c\n", class_npb);
+    exit(1);
+  }
+  log2_size = ilog2(problem_size);
+  /* lt is log of largest total dimension */
+  lt_default = log2_size;
+  /* log of log of maximum dimension on a node */
+  lm = log2_size;
+  ndim1 = lm;
+  ndim3 = log2_size;
+  ndim2 = log2_size;
+
+  fprintf(fp, "#define\tNX_DEFAULT\t%d\n", problem_size);
+  fprintf(fp, "#define\tNY_DEFAULT\t%d\n", problem_size);
+  fprintf(fp, "#define\tNZ_DEFAULT\t%d\n", problem_size);
+  fprintf(fp, "#define\tNIT_DEFAULT\t%d\n", nit);
+  fprintf(fp, "#define\tLM\t%d\n", lm);
+  fprintf(fp, "#define\tLT_DEFAULT\t%d\n", lt_default);
+  fprintf(fp, "#define\tDEBUG_DEFAULT\t%d\n", 0);
+  fprintf(fp, "#define\tNDIM1\t%d\n", ndim1);
+  fprintf(fp, "#define\tNDIM2\t%d\n", ndim2);
+  fprintf(fp, "#define\tNDIM3\t%d\n", ndim3);
+}
+
+
+/*
+ * write_is_info(): Write IS specific info to config file
+ */
+
+void write_is_info(FILE *fp, char class_npb)
+{
+  int m1, m2, m3 ;
+  if( class_npb != 'S' &&
+      class_npb != 'W' &&
+      class_npb != 'A' &&
+      class_npb != 'B' &&
+      class_npb != 'C')
+  {
+    printf("setparams: Internal error: invalid class_npb type %c\n", class_npb);
+    exit(1);
+  }
+}
+
+
+/*
+ * write_cg_info(): Write CG specific info to config file
+ */
+
+void write_cg_info(FILE *fp, char class_npb)
+{
+  int na,nonzer,niter;
+  const char *shift,*rcond="1.0e-1";
+  const char *shiftS="10.0",
+       *shiftW="12.0",
+       *shiftA="20.0",
+       *shiftB="60.0",
+       *shiftC="110.0";
+
+
+  if( class_npb == 'S' )
+  { na=1400; nonzer=7; niter=15; shift=shiftS; }
+  else if( class_npb == 'W' )
+  { na=7000; nonzer=8; niter=15; shift=shiftW; }
+  else if( class_npb == 'A' )
+  { na=14000; nonzer=11; niter=15; shift=shiftA; }
+  else if( class_npb == 'B' )
+  { na=75000; nonzer=13; niter=75; shift=shiftB; }
+  else if( class_npb == 'C' )
+  { na=150000; nonzer=15; niter=75; shift=shiftC; }
+  else
+  {
+    printf("setparams: Internal error: invalid class_npb type %c\n", class_npb);
+    exit(1);
+  }
+  fprintf( fp, "#define\tNA\t%d\n", na);
+  fprintf( fp, "#define\tNONZER\t%d\n", nonzer);
+  fprintf( fp, "#define\tNITER\t%d\n", niter);
+  fprintf( fp, "#define\tSHIFT\t%s\n", shift);
+  fprintf( fp, "#define\tRCOND\t%s\n", rcond);
+}
+
+
+/*
+ * write_ft_info(): Write FT specific info to config file
+ */
+
+void write_ft_info(FILE *fp, char class_npb)
+{
+  /* easiest way (given the way the benchmark is written)
+   * is to specify log of number of grid points in each
+   * direction m1, m2, m3. nt is the number of iterations
+   */
+  int nx, ny, nz, maxdim, niter, np_min;
+  if      (class_npb == 'S') { nx = 64; ny = 64; nz = 64; niter = 6;}
+  else if (class_npb == 'W') { nx = 128; ny = 128; nz = 32; niter = 6;}
+  else if (class_npb == 'A') { nx = 256; ny = 256; nz = 128; niter = 6;}
+  else if (class_npb == 'B') { nx = 512; ny = 256; nz = 256; niter =20;}
+  else if (class_npb == 'C') { nx = 512; ny = 512; nz = 512; niter =20;}
+  else {
+    printf("setparams: Internal error: invalid class_npb type %c\n", class_npb);
+    exit(1);
+  }
+  maxdim = nx;
+  if (ny > maxdim) maxdim = ny;
+  if (nz > maxdim) maxdim = nz;
+  fprintf(fp, "#define\tNX\t%d\n", nx);
+  fprintf(fp, "#define\tNY\t%d\n", ny);
+  fprintf(fp, "#define\tNZ\t%d\n", nz);
+  fprintf(fp, "#define\tMAXDIM\t%d\n", maxdim);
+  fprintf(fp, "#define\tNITER_DEFAULT\t%d\n", niter);
+  fprintf(fp, "#define\tNTOTAL\t%d\n", nx*ny*nz);
+}
+
+/*
+ * write_ep_info(): Write EP specific info to config file
+ */
+
+void write_ep_info(FILE *fp, char class_npb)
+{
+  /* easiest way (given the way the benchmark is written)
+   * is to specify log of number of grid points in each
+   * direction m1, m2, m3. nt is the number of iterations
+   */
+  int m;
+  if      (class_npb == 'S') { m = 24; }
+  else if (class_npb == 'W') { m = 25; }
+  else if (class_npb == 'A') { m = 28; }
+  else if (class_npb == 'B') { m = 30; }
+  else if (class_npb == 'C') { m = 32; }
+  else {
+    printf("setparams: Internal error: invalid class_npb type %c\n", class_npb);
+    exit(1);
+  }
+
+  fprintf(fp, "#define\tCLASS\t \'%c\'\n", class_npb);
+  fprintf(fp, "#define\tM\t%d\n", m);
+}
+
+
+/*
+ * This is a gross hack to allow the benchmarks to
+ * print out how they were compiled. Various other ways
+ * of doing this have been tried and they all fail on
+ * some machine - due to a broken "make" program, or
+ * F77 limitations, of whatever. Hopefully this will
+ * always work because it uses very portable C. Unfortunately
+ * it relies on parsing the make.def file - YUK.
+ * If your machine doesn't have <string.h> or <ctype.h>, happy hacking!
+ *
+ */
+
+#define VERBOSE
+#define LL 400
+#include <stdio.h>
+#define DEFFILE "../config/make.def"
+#define DEFAULT_MESSAGE "(none)"
+void write_compiler_info(int type, FILE *fp)
+{
+  FILE *deffile;
+  char line[LL];
+  char f77[LL], flink[LL], f_lib[LL], f_inc[LL], fflags[LL], flinkflags[LL];
+  char compiletime[LL], randfile[LL];
+  char cc[LL], cflags[LL], clink[LL], clinkflags[LL],
+       c_lib[LL], c_inc[LL];
+  struct tm *tmp;
+  time_t t;
+  deffile = fopen(DEFFILE, "r");
+  if (deffile == NULL) {
+    printf("\n\
+setparams: File %s doesn't exist. To build the NAS benchmarks\n\
+           you need to create is according to the instructions\n\
+           in the README in the main directory and comments in \n\
+           the file config/make.def.template\n", DEFFILE);
+    exit(1);
+  }
+  strcpy(f77, DEFAULT_MESSAGE);
+  strcpy(flink, DEFAULT_MESSAGE);
+  strcpy(f_lib, DEFAULT_MESSAGE);
+  strcpy(f_inc, DEFAULT_MESSAGE);
+  strcpy(fflags, DEFAULT_MESSAGE);
+  strcpy(flinkflags, DEFAULT_MESSAGE);
+  strcpy(randfile, DEFAULT_MESSAGE);
+  strcpy(cc, DEFAULT_MESSAGE);
+  strcpy(cflags, DEFAULT_MESSAGE);
+  strcpy(clink, DEFAULT_MESSAGE);
+  strcpy(clinkflags, DEFAULT_MESSAGE);
+  strcpy(c_lib, DEFAULT_MESSAGE);
+  strcpy(c_inc, DEFAULT_MESSAGE);
+
+  while (fgets(line, LL, deffile) != NULL) {
+    if (*line == '#') continue;
+    /* yes, this is inefficient. but it's simple! */
+    check_line(line, (char*)"F77", f77);
+    check_line(line, (char*)"FLINK", flink);
+    check_line(line, (char*)"F_LIB", f_lib);
+    check_line(line, (char*)"F_INC", f_inc);
+    check_line(line, (char*)"FFLAGS", fflags);
+    check_line(line, (char*)"FLINKFLAGS", flinkflags);
+    check_line(line, (char*)"RAND", randfile);
+    check_line(line, (char*)"CC", cc);
+    check_line(line, (char*)"CFLAGS", cflags);
+    check_line(line, (char*)"CLINK", clink);
+    check_line(line, (char*)"CLINKFLAGS", clinkflags);
+    check_line(line, (char*)"C_LIB",c_lib);
+    check_line(line, (char*)"C_INC", c_inc);
+  }
+
+
+  (void) time(&t);
+  tmp = localtime(&t);
+  (void) strftime(compiletime, (size_t)LL, "%d %b %Y", tmp);
+
+
+  switch(type) {
+      case FT:
+      case SP:
+      case BT:
+      case MG:
+      case LU:
+      case EP:
+      case CG:
+          put_def_string(fp, (char*)"COMPILETIME", compiletime);
+          put_def_string(fp, (char*)"NPBVERSION", (char*)VERSION);
+          put_def_string(fp, (char*)"CS1", cc);
+          put_def_string(fp, (char*)"CS2", clink);
+          put_def_string(fp, (char*)"CS3", c_lib);
+          put_def_string(fp, (char*)"CS4", c_inc);
+          put_def_string(fp, (char*)"CS5", cflags);
+          put_def_string(fp, (char*)"CS6", clinkflags);
+	  put_def_string(fp, (char*)"CS7", randfile);
+          break;
+      case IS:
+      case DC:
+          put_def_string(fp, (char*)"COMPILETIME", compiletime);
+          put_def_string(fp, (char*)"NPBVERSION", (char*)VERSION);
+          put_def_string(fp, (char*)"CC", cc);
+          put_def_string(fp, (char*)"CFLAGS", cflags);
+          put_def_string(fp, (char*)"CLINK", clink);
+          put_def_string(fp, (char*)"CLINKFLAGS", clinkflags);
+          put_def_string(fp, (char*)"C_LIB", c_lib);
+          put_def_string(fp, (char*)"C_INC", c_inc);
+          break;
+      default:
+          printf("setparams: (Internal error): Unknown benchmark type %d\n",
+                                                                         type);
+          exit(1);
+  }
+
+}
+
+void check_line(char *line, char *label, char *val)
+{
+  char *original_line;
+  original_line = line;
+  /* compare beginning of line and label */
+  while (*label != '\0' && *line == *label) {
+    line++; label++;
+  }
+  /* if *label is not EOS, we must have had a mismatch */
+  if (*label != '\0') return;
+  /* if *line is not a space, actual label is longer than test label */
+  if (!isspace(*line) && *line != '=') return ;
+  /* skip over white space */
+  while (isspace(*line)) line++;
+  /* next char should be '=' */
+  if (*line != '=') return;
+  /* skip over white space */
+  while (isspace(*++line));
+  /* if EOS, nothing was specified */
+  if (*line == '\0') return;
+  /* finally we've come to the value */
+  strcpy(val, line);
+  /* chop off the newline at the end */
+  val[strlen(val)-1] = '\0';
+  if (val[strlen(val) - 1] == '\\') {
+    printf("\n\
+setparams: Error in file make.def. Because of the way in which\n\
+           command line arguments are incorporated into the\n\
+           executable benchmark, you can't have any continued\n\
+           lines in the file make.def, that is, lines ending\n\
+           with the character \"\\\". Although it may be ugly, \n\
+           you should be able to reformat without continuation\n\
+           lines. The offending line is\n\
+  %s\n", original_line);
+    exit(1);
+  }
+}
+
+int check_include_line(char *line, char *filename)
+{
+  char *include_string = (char*)"include";
+  /* compare beginning of line and "include" */
+  while (*include_string != '\0' && *line == *include_string) {
+    line++; include_string++;
+  }
+  /* if *include_string is not EOS, we must have had a mismatch */
+  if (*include_string != '\0') return(0);
+  /* if *line is not a space, first word is not "include" */
+  if (!isspace(*line)) return(0);
+  /* skip over white space */
+  while (isspace(*++line));
+  /* if EOS, nothing was specified */
+  if (*line == '\0') return(0);
+  /* next keyword should be name of include file in *filename */
+  while (*filename != '\0' && *line == *filename) {
+    line++; filename++;
+  }
+  if (*filename != '\0' ||
+      (*line != ' ' && *line != '\0' && *line !='\n')) return(0);
+  else return(1);
+}
+
+
+#define MAXL 146
+void put_string(FILE *fp, char *name, char *val)
+{
+  int len;
+  len = strlen(val);
+  if (len > MAXL) {
+    val[MAXL] = '\0';
+    val[MAXL-1] = '.';
+    val[MAXL-2] = '.';
+    val[MAXL-3] = '.';
+    len = MAXL;
+  }
+  fprintf(fp, "%scharacter*%d %s\n", FINDENT, len, name);
+  fprintf(fp, "%sparameter (%s=\'%s\')\n", FINDENT, name, val);
+}
+
+/* NOTE: is the ... stuff necessary in C? */
+void put_def_string(FILE *fp, char *name, char *val)
+{
+  int len;
+  len = strlen(val);
+  if (len > MAXL) {
+    val[MAXL] = '\0';
+    val[MAXL-1] = '.';
+    val[MAXL-2] = '.';
+    val[MAXL-3] = '.';
+    len = MAXL;
+  }
+  fprintf(fp, "#define %s \"%s\"\n", name, val);
+}
+
+void put_def_variable(FILE *fp, char *name, char *val)
+{
+  int len;
+  len = strlen(val);
+  if (len > MAXL) {
+    val[MAXL] = '\0';
+    val[MAXL-1] = '.';
+    val[MAXL-2] = '.';
+    val[MAXL-3] = '.';
+    len = MAXL;
+  }
+  fprintf(fp, "#define %s %s\n", name, val);
+}
+
+
+
+#if 0
+
+/* this version allows arbitrarily long lines but
+ * some compilers don't like that and they're rarely
+ * useful
+ */
+
+#define LINELEN 65
+void put_string(FILE *fp, char *name, char *val)
+{
+  int len, nlines, pos, i;
+  char line[100];
+  len = strlen(val);
+  nlines = len/LINELEN;
+  if (nlines*LINELEN < len) nlines++;
+  fprintf(fp, "%scharacter*%d %s\n", FINDENT, nlines*LINELEN, name);
+  fprintf(fp, "%sparameter (%s = \n", FINDENT, name);
+  for (i = 0; i < nlines; i++) {
+    pos = i*LINELEN;
+    if (i == 0) fprintf(fp, "%s\'", CONTINUE);
+    else        fprintf(fp, "%s", CONTINUE);
+    /* number should be same as LINELEN */
+    fprintf(fp, "%.65s", val+pos);
+    if (i == nlines-1) fprintf(fp, "\')\n");
+    else             fprintf(fp, "\n");
+  }
+}
+
+#endif
+
+
+/* integer log base two. Return error is argument isn't
+ * a power of two or is less than or equal to zero
+ */
+
+int ilog2(int i)
+{
+  int log2;
+  int exp2 = 1;
+  if (i <= 0) return(-1);
+
+  for (log2 = 0; log2 < 20; log2++) {
+    if (exp2 == i) return(log2);
+    exp2 *= 2;
+  }
+  return(-1);
+}
+
+
+
+void write_convertdouble_info(int type, FILE *fp)
+{
+  switch(type) {
+  case SP:
+  case BT:
+  case LU:
+  case FT:
+  case MG:
+  case EP:
+  case CG:
+#ifdef CONVERTDOUBLE
+    fprintf(fp, "#define\tCONVERTDOUBLE\tTRUE\n");
+#else
+    fprintf(fp, "#define\tCONVERTDOUBLE\tFALSE\n");
+#endif
+    break;
+  }
+}