From 605cff94ffaf0a865195f173ecbf088188d1fd15 Mon Sep 17 00:00:00 2001
From: Patrick Flick <patrick.flick@gmail.com>
Date: Tue, 3 Oct 2017 15:46:15 -0400
Subject: [PATCH 1/4] added benchmark executable

---
 src/CMakeLists.txt   |  3 ++
 src/benchmark_bw.cpp | 65 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 src/benchmark_bw.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ff34356..086d79c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,3 +5,6 @@ project(mxx-bm)
 
 add_executable(mxx-bm-vote-off vote_off.cpp)
 target_link_libraries(mxx-bm-vote-off ${MPI_LIBRARIES})
+
+add_executable(mxx-benchmark benchmark_bw.cpp)
+target_link_libraries(mxx-benchmark ${MPI_LIBRARIES})
diff --git a/src/benchmark_bw.cpp b/src/benchmark_bw.cpp
new file mode 100644
index 0000000..5dd1876
--- /dev/null
+++ b/src/benchmark_bw.cpp
@@ -0,0 +1,65 @@
+
+#include <mxx/env.hpp>
+#include <mxx/comm.hpp>
+#include <mxx/benchmark.hpp>
+#include <mxx/utils.hpp>
+
+#include <ext/cxx-prettyprint/prettyprint.hpp>
+
+std::string exec_name;
+
+void print_usage() {
+    std::cerr << "Usage: " << exec_name << " <n> <out-node-filename>" << std::endl;
+    std::cerr << "where" << std::endl;
+    std::cerr << "    <n>                     Number of nodes to vote off." << std::endl;
+    std::cerr << "    <out-node-filename>     Filename for the new nodefile, output by this program." << std::endl;
+}
+
+int main(int argc, char* argv[]) {
+    mxx::env e(argc, argv);
+    mxx::comm comm;
+
+    // print out node and rank distribution
+    mxx::print_node_distribution(comm);
+
+    // create shared-mem MPI+MPI hybrid communicator
+    mxx::hybrid_comm hc(comm);
+
+    // assert same number processors per node
+    int proc_per_node = hc.local.size();
+    if (!mxx::all_same(proc_per_node, comm)) {
+        std::cerr << "Error: this benchmark assumes the same number of processors per node" << std::endl;
+        MPI_Abort(comm, -1);
+    }
+
+    // assert we have an even number of nodes
+    int num_nodes = hc.num_nodes();
+    if (num_nodes % 2 != 0) {
+        std::cerr << "Error: this benchmark assumes an even number of nodes" << std::endl;
+        MPI_Abort(comm, -1);
+    }
+
+    /*
+    // parse input arguments
+    exec_name = argv[0];
+    if (argc < 3) {
+        print_usage();
+        MPI_Abort(comm, -1);
+    }
+    int n_vote_off = atoi(argv[1]);
+    std::string output_nodefile(argv[2]);
+    if (n_vote_off < 0) {
+        print_usage();
+        MPI_Abort(comm, -1);
+    }
+    */
+
+
+    std::vector<double> bw_row = mxx::pairwise_bw_matrix(hc);
+
+    mxx::print_bw_matrix_stats(hc, bw_row);
+
+    mxx::bw_all2all(hc.global, hc.local);
+
+    return 0;
+}

From 492ab47be718a635faa4b46584d3ef4c442b7364 Mon Sep 17 00:00:00 2001
From: Patrick Flick <patrick.flick@gmail.com>
Date: Fri, 15 Dec 2017 14:29:15 -0500
Subject: [PATCH 2/4] benchmarks

---
 include/mxx/benchmark.hpp  | 487 ++++++++++++++++++++++++++-----------
 include/mxx/samplesort.hpp |   7 +-
 include/mxx/stream.hpp     |   8 +
 src/CMakeLists.txt         |  16 +-
 src/benchmark_a2a.cpp      |  89 +++++++
 src/benchmark_bw.cpp       |  65 -----
 src/benchmark_p2p_bw.cpp   |  82 +++++++
 src/benchmark_sort.cpp     |  54 ++++
 src/pbs_run.sh             |  24 ++
 src/vote_off.cpp           |   2 +-
 test/CMakeLists.txt        |   3 -
 test/benchmarks.cpp        |  22 --
 12 files changed, 621 insertions(+), 238 deletions(-)
 create mode 100644 src/benchmark_a2a.cpp
 delete mode 100644 src/benchmark_bw.cpp
 create mode 100644 src/benchmark_p2p_bw.cpp
 create mode 100644 src/benchmark_sort.cpp
 create mode 100644 src/pbs_run.sh
 delete mode 100644 test/benchmarks.cpp

diff --git a/include/mxx/benchmark.hpp b/include/mxx/benchmark.hpp
index a621d5b..23de508 100644
--- a/include/mxx/benchmark.hpp
+++ b/include/mxx/benchmark.hpp
@@ -79,6 +79,21 @@ class hybrid_comm {
         return result;
     }
 
+    // split the local communicators in the same way on all processes
+    // This splits the `local` and `global` communicator and leaves the
+    // `local_master` as is, assuming that `color` is identical for all
+    // processes in a `local_master`.
+    hybrid_comm split_local(int color) const {
+        // `local_master` is copied rather than split, so every process in
+        // it has to pass the same `color`
+        MXX_ASSERT(mxx::all_same(color, local_master));
+        hybrid_comm result;
+        result.local = local.split(color);
+        result.local_master = local_master.copy();
+        result.global = global.split(color);
+        return result;
+    }
+
     // move constructor moves all members
     hybrid_comm(hybrid_comm&& o) = default;
 
@@ -100,6 +115,26 @@ class hybrid_comm {
         global.barrier();
     }
 
+    // calls `func` only on processes with local rank < `ppn`; all processes of `global` must call this
+    template <typename Func>
+    void with_ppn(int ppn, Func func) const {
+        MXX_ASSERT(mxx::all_same(ppn, global));
+        MXX_ASSERT(mxx::all_of(ppn <= local.size()));
+
+        int participate = local.rank() < ppn; // 1 for the first `ppn` local ranks, 0 otherwise
+        if (mxx::all_same(participate, global)) { // no mix of in/out processes: no split needed
+            if (participate) {
+                func(*this);
+            }
+        } else {
+            hybrid_comm hc(split_local(participate)); // every process splits, including excluded ones
+            if (participate) {
+                func(hc);
+            }
+        }
+        global.barrier(); // participating and excluded processes meet again here
+    }
+
     int num_nodes() const {
         return global.size() / local.size();
     }
@@ -152,25 +187,27 @@ std::pair<double,double> bw_simplex(const mxx::comm& c, int partner, size_t n) {
     return std::pair<double,double>(bw1, bw2);
 }
 
+/**
+ * Times one MPI_Sendrecv of `sendvec`/`recvvec` with rank `partner`, after a barrier on `c`; returns microseconds.
+ */
 template <typename T>
 double time_duplex(const mxx::comm& c, int partner, const std::vector<T>& sendvec, std::vector<T>& recvvec) {
     MXX_ASSERT(sendvec.size() == recvvec.size());
-
     size_t n = sendvec.size();
+    mxx::datatype dt = mxx::get_datatype<T>(); // must describe the buffers' element type T, built before timing starts
     c.barrier();
     auto start = std::chrono::steady_clock::now();
-    mxx::datatype dt = mxx::get_datatype<size_t>();
     // sendrecv for full duplex
     MPI_Sendrecv(const_cast<T*>(&sendvec[0]), n, dt.type(), partner, 0, &recvvec[0], n, dt.type(), partner, 0, c, MPI_STATUS_IGNORE);
-    //c.barrier();
     auto end = std::chrono::steady_clock::now();
     double time_p2p = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
-
-    // calculate duplex bandwidth
-    //double bw = 2*8*(double)n*sizeof(size_t)/time_p2p/1000.0;
     return time_p2p;
 }
 
+/**
+ * Times duplex sendrecv with a partner process and compute the
+ * duplex bandwidth per node.
+ */
 template <typename T>
 double bw_duplex_per_node(const mxx::comm& c, int partner, const mxx::comm& smc, const std::vector<T>& sendvec, std::vector<T>& recvvec) {
     size_t n = sendvec.size();
@@ -181,82 +218,133 @@ double bw_duplex_per_node(const mxx::comm& c, int partner, const mxx::comm& smc,
 }
 
 
-void bm(const mxx::comm& c, int partner, const mxx::comm& smc) {
 
-    size_t n = 10000000;
-    n /= smc.size();
-    size_t MB = (n*sizeof(size_t))/1024/1024;
-    std::vector<size_t> vec(n);
-    std::vector<size_t> result(n);
-    std::generate(vec.begin(), vec.end(), std::rand);
 
-    std::string node_name = get_processor_name();
-
-    double timed = time_duplex(c, partner, vec, result);
-    double bw_send, bw_recv;
-    std::tie(bw_send, bw_recv) = bw_simplex(c, partner, n);
-
-    // calculate BW
-    double maxtime_duplex = mxx::allreduce(timed, mxx::max<double>(), smc);
-    double sum_bwd = 2*8*n*sizeof(size_t)*smc.size()/maxtime_duplex/1000.0;
-    double sum_bw_send = mxx::allreduce(bw_send, std::plus<double>(), smc);
-    double sum_bw_recv = mxx::allreduce(bw_recv, std::plus<double>(), smc);
-    c.with_subset(smc.rank() == 0, [&](const mxx::comm& subcomm) {
-        mxx::sync_cout(subcomm) << "[" << node_name << "]: Node BW Duplex = "
-        << sum_bwd << " Gb/s (Simplex " << sum_bw_send << " Gb/s send, " << sum_bw_recv
-        << " Gb/s recv) [" << MB*smc.size() << " MiB]" << std::endl;
+
+/**
+ * @brief Executes the given function f(i) for each pair of processes where
+ *        there are `size` processes and this process has rank `rank`.
+ *        Processes are paired such that in every iteration
+ *        f(i) is called on rank `rank` iff simultaneously f(rank) is called on
+ *        rank `i`.
+ *
+ *        The function f(i) is called once for all ranks 0,...,size-1,
+ *        excluding i = rank.
+ *
+ *        If in any iteration, no partner is found, f(-1) is called. This happens
+ *        whenever `size` is not a power of two, e.g. for an odd `size`.
+ *
+ * @param rank  The rank of this process
+ * @param size  The number of total processes
+ * @param f     The function to be called for each paired rank.
+ */
+template <typename F>
+void pairwise_func(int rank, int size, F f) {
+    // blocks of `dist` consecutive ranks are paired; `dist` doubles every outer iteration
+    // for each pair of blocks, pair up all combinations via a linear offset
+    for (int dist = 1; dist < size; dist <<= 1) {
+        int partner_block;
+        if ((rank / dist) % 2 == 0) {
+            // even block index: the partner is the next block (higher ranks)
+            partner_block = (rank/dist + 1)*dist;
+        } else {
+            partner_block = (rank/dist - 1)*dist; // odd block index: previous block (lower ranks)
+        }
+        int inblock_idx = rank % dist;
+        for (int i = 0; i < dist; ++i) {
+            int partner;
+            if (partner_block >= rank)
+                partner = partner_block + (inblock_idx + i) % dist;
+            else
+                partner = partner_block + (inblock_idx + (dist - i)) % dist; // opposite offset, so both sides pick each other
+            if (partner < size) {
+                f(partner);
+            } else {
+                // this rank doesn't have a partner in this iteration
+                f(-1);
+            }
+        }
+    }
+}
+
+
+/**
+ * @brief Calls f(c, partner) for each pair of processes and f(c, -1) if there
+ *        is no partner in any given round. (see `pairwise_func(int,int,F)`)
+ */
+template <typename F>
+void pairwise_func(const mxx::comm& c, F f) {
+    pairwise_func(c.rank(), c.size(), [&](int partner){ f(c, partner);});
+}
+
+// calls f once for every pairing of this node with another node
+// if there are multiple processes per node, the processes are matched
+// up by their local ranks
+// The given function is called using global ranks as the partner index
+// If in any iteration there is no partner node, this calls f(-1)
+template <typename F>
+void pairwise_nodes_func(const hybrid_comm& hc, F f) {
+    MXX_ASSERT(mxx::all_same(hc.local.size()));
+    pairwise_func(hc.node_rank(), hc.num_nodes(), [&](int partner_node) {
+        if (partner_node >= 0) {
+            int partner_rank = partner_node * hc.local.size() + hc.local.rank(); // assumes global ranks are numbered node by node - TODO confirm
+            f(partner_rank);
+        } else {
+            f(-1);
+        }
     });
 }
 
 // returns a row of pairwise bw benchmark results on each process where smc.rank() == 0
-std::vector<double> pairwise_bw_matrix(const hybrid_comm& hc) {
-    int num_nodes = hc.num_nodes();
-    size_t n = 1000000;
+std::vector<double> pairwise_bw_matrix(const hybrid_comm& hc, size_t msg_size) {
+    size_t n = msg_size / 8;
     std::vector<size_t> vec(n);
     std::vector<size_t> result(n);
     std::generate(vec.begin(), vec.end(), std::rand);
-    n /= hc.local.size();
+
     // nodes get partnered
-    int node_idx = hc.node_rank();
     std::vector<double> bw_row;
     if (hc.is_local_master())
-        bw_row.resize(num_nodes);
-    for (int dist = 1; dist < num_nodes; dist <<= 1) {
-        if (hc.global.rank() == 0) {
-            std::cout << "Benchmarking p2p duplex for dist = " << dist << std::endl;
-        }
-        int partner_block;
-        if ((node_idx / dist) % 2 == 0) {
-            // to left block
-            partner_block = (node_idx/dist + 1)*dist;
-        } else {
-            partner_block = (node_idx/dist - 1)*dist;
-        }
-        int inblock_idx = node_idx % dist;
-        for (int i = 0; i < dist; ++i) {
-            int partner_node;
-            if (partner_block >= node_idx)
-                partner_node = partner_block + (inblock_idx + i) % dist;
-            else
-                partner_node = partner_block + (inblock_idx + (dist - i)) % dist;
-            int partner = partner_node*hc.local.size() + hc.local.rank();
-            // benchmark duplex with partner
-            if (partner_node < num_nodes) {
-                double bw = bw_duplex_per_node(hc.global, partner, hc.local, vec, result);
-                if (hc.is_local_master()) {
-                    bw_row[partner_node] = bw;
-                }
-            } else {
-                // if this node doesn't participate in the benchmarking, it
-                // still needs to call the barrier that is otherwise called
-                // inside the time_duplex function
-                hc.global.barrier();
+        bw_row.resize(hc.num_nodes());
+
+    pairwise_nodes_func(hc, [&](int partner){
+        if (partner >= 0) {
+            double bw = bw_duplex_per_node(hc.global, partner, hc.local, vec, result);
+            if (hc.is_local_master()) {
+                int partner_node = partner / hc.local.size();
+                bw_row[partner_node] = bw;
             }
+        } else {
+            // if this node doesn't participate in the benchmarking, it
+            // still needs to call the barrier that is otherwise called
+            // inside the time_duplex function
+            hc.global.barrier();
         }
-    }
+    });
+
     return bw_row;
 }
 
+// Each local master emits one CSV row to `filename`: its node name, then `values` with 2 decimals.
+void save_matrix_pernode(const hybrid_comm& hc, const std::string& filename, const std::vector<double>& values) {
+    std::ofstream of;
+    if (hc.global.rank() == 0) { // the file is opened on global rank 0 only
+        of.open(filename);
+    }
+    hc.with_local_master([&](){
+        std::stringstream ss;
+        ss << hc.node_name << ",";
+        ss << std::fixed << std::setprecision(2);
+        for (size_t i = 0; i < values.size(); ++i) {
+            ss << values[i];
+            if (i+1 < values.size()) // no trailing comma after the last value
+                ss << ",";
+        }
+        // rows go through a sync stream over `local_master`; presumably its root 0 is global rank 0 - TODO confirm
+        mxx::sync_os(hc.local_master, of) << ss.str() << std::endl;
+    });
+}
+
 void print_bw_matrix_stats(const hybrid_comm& hc, const std::vector<double>& bw_row) {
     // print matrix
     hc.with_local_master([&](){
@@ -402,10 +490,199 @@ void write_new_nodefile(const hybrid_comm& hc, bool participate, const std::stri
 }
 
 
+// all2all benchmark helpers; bm_all2all() writes csv rows with the columns:
+// p, nnodes, ppn, m, n (in bytes), min_time, avg_time, max_time
+
+// m: number of elements of type T sent to each process; one process sends m*c.size() elements
+// n = m*p^2
+// returns the time taken by this process for the all2all in microseconds
+template <typename T>
+double timed_all2all_impl(const mxx::comm& c, size_t m) {
+    // create the send and receive arrays (value-initialized; random fill is disabled below)
+    std::vector<T> els(m*c.size());
+    //std::generate(els.begin(), els.end(), [](){ return std::rand() % 255; });
+    std::vector<T> rcv(m*c.size());
+    mxx::datatype dt = mxx::get_datatype<T>(); // NOTE(review): unused while the raw MPI_Alltoall call below stays commented out
+
+    /* run the all2all (via mxx::all2all) and time it */
+    c.barrier();
+    auto start = std::chrono::steady_clock::now();
+    //MPI_Alltoall(&els[0], m, dt.type(), &rcv[0], m, dt.type(), c);
+    mxx::all2all(&els[0], m, &rcv[0], c);
+    auto end = std::chrono::steady_clock::now();
+
+    /* return the time taken on this process in microseconds */
+    double time_all2all = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+    return time_all2all;
+}
+
+double timed_all2all(const mxx::comm& c, size_t m) { // m: bytes sent to each process
+    if (m >= 8 && m % 8 == 0) { // use the widest element type whose size divides m
+        return timed_all2all_impl<uint64_t>(c, m/8);
+    } else if (m >= 4 && m % 4 == 0) {
+        return timed_all2all_impl<uint32_t>(c, m/4);
+    } else {
+        return timed_all2all_impl<unsigned char>(c, m); // fall back to single bytes
+    }
+}
+
+void bm_all2all(const mxx::hybrid_comm& hc, std::ostream& os, size_t max_size_per_node) { // rank 0 of hc.global writes one csv row per message size to `os`
+    const mxx::comm& c = hc.global;
+    assert(sizeof(size_t) == 8);
+    if (hc.global.rank() == 0) {
+        std::cerr << "bm_all2all with np = " << hc.global.size() << ", ppn = " << hc.local.size() << ", nnodes = " << hc.num_nodes() << std::endl;
+    }
+    for (size_t m = 1; (m*c.size()*hc.local.size()) <= max_size_per_node; m <<= 1) { // double m while the bytes per node fit the budget
+        if (m >= std::numeric_limits<int>::max()) // NOTE(review): presumably guards an int message count further down - confirm
+            break;
+        size_t np = m*c.size(); // data per process
+        size_t n = np*c.size(); // total number of bytes globally
+
+        if (hc.global.rank() == 0) {
+            std::cerr << "bm_all2all         m = " << m << ", n/p = " << np << ", n = " << n << ", n/node = " << (m*c.size()*hc.local.size()) << std::endl;
+        }
+
+        double time_all2all = timed_all2all(c, m);
+
+        /* get min, max and average */
+        double max_time = mxx::allreduce(time_all2all, mxx::max<double>(), c);
+        double min_time = mxx::allreduce(time_all2all, mxx::min<double>(), c);
+        double avg_time = mxx::allreduce(time_all2all, c) / c.size();
+
+        // p, nnodes, ppn, m, n, time_min, time_avg, time_max
+        if (c.rank() == 0) {
+            os << c.size() << "," << hc.num_nodes() << "," << hc.local.size() << "," << m << "," << n << "," << min_time << "," << avg_time << "," << max_time << std::endl;
+        }
+    }
+}
+
+#if 0
+template <typename F>
+void bm_coll_function(const mxx::hybrid_comm& hc, std::ostream& os, F f, size_t max_size_per_node) {
+    const mxx::comm& c = hc.global;
+    for (size_t m = 1; (m*c.size()*hc.local.size()) <= max_size_per_node; m <<= 1) {
+        size_t np = m*c.size(); // max data per process
+        size_t n = np*c.size(); // max total number of bytes globally (all2all and allgather)
+
+        // execute the timed function
+        double val = f(c, m);
+
+        /* get min, max and average */
+        double max_val = mxx::allreduce(val, mxx::max<double>(), comm);
+        double min_val = mxx::allreduce(val, mxx::min<double>(), comm);
+        double avg_val = mxx::allreduce(val, comm) / comm.size();
+
+        // p, q, ppn, m, n, time_min, time_avg, time_max
+        if (c.rank() == 0) {
+            os << c.size() << "," << hc.num_nodes() << "," << hc.local.size() << "," << m << "," << n << "," << min_time << "," << avg_time << "," << max_time << std::endl;
+        }
+    }
+}
+#endif
+
+// largest power of 2 that is <= x (x itself if x is a power of 2; 0 for x == 0)
+uint32_t flp2 (uint32_t x)
+{
+    x = x | (x >> 1); // these five steps copy the highest set bit into every lower bit
+    x = x | (x >> 2);
+    x = x | (x >> 4);
+    x = x | (x >> 8);
+    x = x | (x >> 16);
+    return x - (x >> 1); // clears everything below the highest set bit
+}
+
+
+/**
+ * @brief Executes the given function `f(hybrid_comm hc_ppn)` for the total
+ *        ppn and then for every smaller power of 2 down to 1.
+ *
+ * @param hc    Hybrid communicator which is split for all `ppn` values.
+ * @param f     The function called for each `ppn` value (copied per call).
+ */
+template <typename F>
+void forall_p2_ppn(const mxx::hybrid_comm& hc, F f) {
+    int ppn = hc.local.size();
+    MXX_ASSERT(mxx::all_same(ppn, hc.global));
+    for (int q = ppn; q >= 1; q = flp2(q-1)) {
+        // split by ppn; pass a copy, forwarding would move from `f` in every iteration
+        hc.with_ppn(q, f);
+    }
+}
+
+template <typename F>
+void forall_p2_nnodes_and_ppn(const mxx::hybrid_comm& hc, F f) { // calls f(hybrid_comm) for each (node count, ppn) combination
+    int ppn = hc.local.size();
+    MXX_ASSERT(mxx::all_same(ppn, hc.global));
+
+    // node counts: all nodes first, then decreasing powers of two
+    for (int nn = hc.num_nodes(); nn >= 2; nn = flp2(nn-1)) { // NOTE(review): nn >= 2 never runs on a single node - confirm intended
+        // split by nodes
+        hc.with_nodes(hc.node_rank() < nn, [&](const mxx::hybrid_comm& hcn) {
+            for (int q = ppn; q >= 1; q = flp2(q-1)) {
+                // split by ppn; pass a copy, forwarding would move from `f` in every iteration
+                hcn.with_ppn(q, f);
+            }
+        });
+    }
+}
+
+void bm_all2all_forall_q(const mxx::hybrid_comm& hc, std::ostream& os, size_t max_size_per_node) {
+    int ppn = hc.local.size();
+    MXX_ASSERT(mxx::all_same(ppn, hc.global));
+    // runs bm_all2all with q = ppn processes per node, then with each smaller power of two down to 1
+    for (int q = ppn; q >= 1; q = flp2(q-1)) { // flp2(q) returns q for powers of two and would loop forever
+        hc.with_ppn(q, [&](const mxx::hybrid_comm& hcq) {
+            bm_all2all(hcq, os, max_size_per_node);
+        });
+    }
+}
+
+// TODO: this isn't working yet
+double ping(const mxx::comm& c, int partner, int rounds = 100) {
+    // pairwise ping measurement with this process and process of rank `partner`
+
+    int msg = 0;
+    std::chrono::steady_clock::time_point rcv_tp, send_tp;
+    MPI_Status st;
+
+    if (c.rank() < partner) {
+        // the lower rank sends the opening message, tagged 0
+        send_tp = std::chrono::steady_clock::now();
+        MPI_Send(&msg, 1, MPI_INT, partner, 0, c);
+    } else {
+        //MPI_Recv(&msg, 1, MPI_INT, partner, MPI_ANY_TAG, c, &st);
+    }
+
+    int ping_cnt = 0;
+    double ping_sum = 0.;
+    for (int i = 0; i <= rounds; ++i) {
+            MPI_Recv(&msg, 1, MPI_INT, partner, MPI_ANY_TAG, c, &st);
+            rcv_tp = std::chrono::steady_clock::now();
+            int t = st.MPI_TAG; // the tag counts the messages exchanged so far
+            if (t > 0) { // tag 0 is the opening message: this side has not sent anything yet
+                    // time between our last send and this receive, in microseconds
+                    double time_diff = std::chrono::duration_cast<std::chrono::microseconds>(rcv_tp - send_tp).count();
+                    ping_cnt += 1;
+                    ping_sum += time_diff;
+            }
+            if (t < 2*rounds) {
+                    send_tp = std::chrono::steady_clock::now();
+                    MPI_Send(&msg, 1, MPI_INT, partner, t+1, c); // answer with the tag incremented by one
+            }
+            if (t+1 == 2*rounds)
+                    break;
+    }
+    assert(ping_cnt == rounds);
+    return ping_sum / ping_cnt; // mean of the measured time differences
+}
+
+
+
 void bw_all2all(const mxx::comm& c, const mxx::comm& smc) {
     // message size per target processor
     for (int k = 1; k <= 16; k <<= 1) {
-        int m = k*1024;
+        size_t n = k*1024*1024;
+        int m = n/c.size();
         std::vector<size_t> els(m*c.size());
         std::generate(els.begin(), els.end(), std::rand);
         std::vector<size_t> rcv(m*c.size());
@@ -418,11 +695,11 @@ void bw_all2all(const mxx::comm& c, const mxx::comm& smc) {
         double time_all2all = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
         double max_time = mxx::allreduce(time_all2all, mxx::max<double>(), c);
         double min_time = mxx::allreduce(time_all2all, mxx::min<double>(), c);
-        size_t bits_sendrecv = 2*8*sizeof(size_t)*m*(c.size() - smc.size());
+        size_t bits_sendrecv = 2*8*sizeof(size_t)*smc.size()*m*(c.size() - smc.size());
         // bandwidth in Gb/s
         double bw = bits_sendrecv / max_time / 1000.0;
         if (c.rank() == 0) {
-            std::cout << "All2all bandwidth: " << bw << " Gb/s [min=" << min_time/1000.0 << " ms, max=" << max_time/1000.0 << " ms, local_size=" << bits_sendrecv/1024/1024 << " MiB]" << std::endl;
+            std::cout << "All2all bandwidth (per node) " << bw << " Gb/s [min=" << min_time/1000.0 << " ms, max=" << max_time/1000.0 << " ms, local_size=" << bits_sendrecv/1024/1024 << " MiB]" << std::endl;
         }
     }
 }
@@ -516,80 +793,6 @@ void bw_all2all_unaligned_char(const mxx::comm& c, const mxx::comm& smc, bool re
     }
 }
 
-void benchmark_nodes_bw_p2p(const mxx::comm& comm = mxx::comm()) {
-    // pair with another node
-    hybrid_comm hc(comm);
-    int proc_per_node = hc.local.size();
-
-    // assert same number processors per node
-    if (!mxx::all_same(proc_per_node, comm)) {
-        std::cerr << "Error: this benchmark assumes the same number of processors per node" << std::endl;
-        MPI_Abort(comm, -1);
-    }
-
-    int num_nodes = hc.num_nodes();
-
-    if (num_nodes % 2 != 0) {
-        std::cerr << "Error: this benchmark assumes an even number of nodes" << std::endl;
-        MPI_Abort(comm, -1);
-    }
-
-    //for (int local_p = 1; local_p <= proc_per_node; local_p <<= 1) {
-    /*
-    int local_p = sm_comm.size();
-        bool participate =  true; //sm_comm.rank() < local_p;
-        mxx::comm c = comm.split(participate, node_idx*local_p + sm_comm.rank());
-        */
-     //   mxx::comm smc = sm_comm.split(participate);
-
-        if (true) {
-            std::vector<double> bw_row = pairwise_bw_matrix(hc);
-            print_bw_matrix_stats(hc, bw_row);
-            bool part = vote_off(hc, 4, bw_row); // TODO: process result
-            if (hc.global.rank() == 0)
-                std::cout << "Before vote off: " << std::endl;
-            bw_all2all(hc.global, hc.local);
-            if (hc.global.rank() == 0)
-                std::cout << "After vote off: " << std::endl;
-            hc.with_nodes(part, [&](const hybrid_comm& subhc) {
-                bw_all2all(subhc.global, subhc.local);
-                bw_all2all_char(subhc.global, subhc.local);
-                bw_all2all_unaligned_char(subhc.global, subhc.local, false);
-                if (subhc.global.rank() == 0)
-                    std::cout << "== With re-alignment" << std::endl;
-                bw_all2all_unaligned_char(subhc.global, subhc.local, true);
-            });
-            write_new_nodefile(hc, part, "blah.nodes");
-
-            /*
-            if (c.rank() == 0) {
-                std::cout << "Running with " << local_p << "/" << proc_per_node << " processes per node" << std::endl;
-            }
-            MXX_ASSERT(c.size() % 2 == 0);
-            if (local_p > 1) {
-                // intranode BW test
-                if (c.rank() == 0)
-                    std::cout << "Intranode BW test" << std::endl;
-                int partner = (c.rank() % 2 == 0) ? c.rank() + 1 : c.rank() - 1;
-                bm(c, partner, smc);
-            }
-            // 1) closest neighbor
-            if (c.rank() == 0)
-                std::cout << "Closest Neighbor BW test" << std::endl;
-            int partner = (node_idx % 2 == 0) ? c.rank() + local_p : c.rank() - local_p;
-            bm(c, partner, smc);
-
-            // 2) furthest neighbor
-            if (c.rank() == 0)
-                std::cout << "Furthest Neighbor BW test" << std::endl;
-            partner = (c.rank() < c.size()/2) ? c.rank() + c.size()/2 : c.rank() - c.size()/2;
-            bm(c, partner, smc);
-            */
-        }
-    //}
-    // wait for other processes to finish the benchmarking
-    //comm.barrier();
-}
 
 } // namespace mxx
 
diff --git a/include/mxx/samplesort.hpp b/include/mxx/samplesort.hpp
index 59d850f..3df9aae 100644
--- a/include/mxx/samplesort.hpp
+++ b/include/mxx/samplesort.hpp
@@ -47,8 +47,11 @@
 #endif
 
 
-#define SS_ENABLE_TIMER 0
-#if SS_ENABLE_TIMER
+#ifndef MXX_SAMPLESORT_TIMER
+#define MXX_SAMPLESORT_TIMER 0
+#endif
+
+#if MXX_SAMPLESORT_TIMER
 #include "timer.hpp"
 #define SS_TIMER_START(comm) mxx::section_timer timer(std::cerr, comm, 0);
 #define SS_TIMER_END_SECTION(str) timer.end_section(str);
diff --git a/include/mxx/stream.hpp b/include/mxx/stream.hpp
index 9104bf4..c57391f 100644
--- a/include/mxx/stream.hpp
+++ b/include/mxx/stream.hpp
@@ -100,6 +100,14 @@ inline sync_ostream sync_cerr(const mxx::comm& comm, int root = 0) {
     return comm.rank() == root ? sync_ostream(comm, root, std::cerr) : sync_ostream(comm, root);
 }
 
+template <typename base_stream> // like sync_cerr above, but for a caller-supplied stream `bs`
+inline sync_ostream sync_os(const mxx::comm& comm, base_stream& bs, int root = 0) {
+    return comm.rank() == root ? sync_ostream(comm, root, bs) : sync_ostream(comm, root); // only `root` is handed `bs`
+}
+
+// TODO: a sync ofstream
+// TODO: a sync cout/cerr which writes [rank] before every line/msg
+
 } // namespace mxx
 
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 086d79c..123d76d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 2.8)
 
 # project settings
 project(mxx-bm)
@@ -6,5 +6,15 @@ project(mxx-bm)
 add_executable(mxx-bm-vote-off vote_off.cpp)
 target_link_libraries(mxx-bm-vote-off ${MPI_LIBRARIES})
 
-add_executable(mxx-benchmark benchmark_bw.cpp)
-target_link_libraries(mxx-benchmark ${MPI_LIBRARIES})
+# benchmark p2p bandwidth
+add_executable(mxx-benchmark-p2p-bw benchmark_p2p_bw.cpp)
+target_link_libraries(mxx-benchmark-p2p-bw ${MPI_LIBRARIES})
+
+# benchmark all2all
+add_executable(mxx-benchmark-a2a benchmark_a2a.cpp)
+target_link_libraries(mxx-benchmark-a2a ${MPI_LIBRARIES})
+
+# benchmark parallel sorting
+add_executable(mxx-sort-benchmark benchmark_sort.cpp)
+target_compile_definitions(mxx-sort-benchmark PUBLIC -DMXX_SAMPLESORT_TIMER=1)
+target_link_libraries(mxx-sort-benchmark ${MPI_LIBRARIES})
diff --git a/src/benchmark_a2a.cpp b/src/benchmark_a2a.cpp
new file mode 100644
index 0000000..b9342f9
--- /dev/null
+++ b/src/benchmark_a2a.cpp
@@ -0,0 +1,89 @@
+#include <string>
+#include <sstream>
+#include <fstream>
+
+#include <mxx/env.hpp>
+#include <mxx/comm.hpp>
+#include <mxx/benchmark.hpp>
+#include <mxx/utils.hpp>
+
+#include <ext/cxx-prettyprint/prettyprint.hpp>
+
+std::string exec_name;
+
+void print_usage() { // prints the command line synopsis to stderr; both arguments are optional, `-m` comes first
+    std::cerr << "Usage: " << exec_name << " [-m <max_mem>] [<output-file>]" << std::endl;
+    std::cerr << "where" << std::endl;
+    std::cerr << "    <output-file>     (optional) Filename for the benchmark results (default: 'all2all_benchmark.csv')." << std::endl;
+    std::cerr << " -m <max_mem>         (optional) Maximum message space per node (in GB) (default: 32)." << std::endl;
+}
+
+int main(int argc, char* argv[]) {
+    mxx::env e(argc, argv);
+    mxx::comm comm;
+
+    // print out node and rank distribution
+    mxx::print_node_distribution(comm);
+
+    // create shared-mem MPI+MPI hybrid communicator
+    mxx::hybrid_comm hc(comm);
+
+    // assert same number processors per node
+    int proc_per_node = hc.local.size();
+    if (!mxx::all_same(proc_per_node, comm)) {
+        std::cerr << "Error: this benchmark assumes the same number of processors per node" << std::endl;
+        MPI_Abort(comm, -1);
+    }
+
+    // assert we have an even number of nodes (a single node is accepted as well)
+    int num_nodes = hc.num_nodes();
+    if (num_nodes > 1 && num_nodes % 2 != 0) {
+        std::cerr << "Error: this benchmark assumes an even number of nodes" << std::endl;
+        MPI_Abort(comm, -1);
+    }
+
+    // default args
+    size_t mem_per_node_gb = 32; // setting the max experiment at 32 GB per node
+    std::string filename = "all2all_benchmark.csv";
+
+    // parse input arguments: [-m <max_mem>] [<output-file>]
+    exec_name = argv[0];
+    argv++; argc--; // skip the program name
+    if (argc >= 2) {
+        std::string x(argv[0]);
+        if (x == "-m") {
+            mem_per_node_gb = atoi(argv[1]); // a negative value wraps to a huge size_t and fails the range check below
+            argv+=2; argc-=2;
+        }
+        if (x != "-m" || mem_per_node_gb > 1024 || mem_per_node_gb == 0) { // two or more arguments must start with a valid `-m`
+            print_usage();
+            MPI_Abort(comm, -1);
+        }
+    }
+    if (argc > 0) {
+        filename = argv[0];
+        argv++; argc--;
+    }
+    if (argc > 0) { // anything left over is an error
+            print_usage();
+            MPI_Abort(comm, -1);
+    }
+
+    MXX_ASSERT(mxx::all_same(mem_per_node_gb, comm));
+
+    // benchmark all:
+    std::ofstream of;
+    if (hc.global.rank() == 0) { // only global rank 0 opens the file and writes the csv header
+        of.open(filename);
+        of << "p,nnodes,q,m,n,min,avg,max" << std::endl;
+    }
+
+    // convert the per-node limit from GB (2^30 bytes) to bytes
+    size_t mempernode = mem_per_node_gb << 30;
+
+    mxx::forall_p2_nnodes_and_ppn(hc, [&](const mxx::hybrid_comm& hc){ // this `hc` shadows the outer one
+        bm_all2all(hc, of, mempernode);
+    });
+
+    return 0;
+}
diff --git a/src/benchmark_bw.cpp b/src/benchmark_bw.cpp
deleted file mode 100644
index 5dd1876..0000000
--- a/src/benchmark_bw.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-
-#include <mxx/env.hpp>
-#include <mxx/comm.hpp>
-#include <mxx/benchmark.hpp>
-#include <mxx/utils.hpp>
-
-#include <ext/cxx-prettyprint/prettyprint.hpp>
-
-std::string exec_name;
-
-void print_usage() {
-    std::cerr << "Usage: " << exec_name << " <n> <out-node-filename>" << std::endl;
-    std::cerr << "where" << std::endl;
-    std::cerr << "    <n>                     Number of nodes to vote off." << std::endl;
-    std::cerr << "    <out-node-filename>     Filename for the new nodefile, output by this program." << std::endl;
-}
-
-int main(int argc, char* argv[]) {
-    mxx::env e(argc, argv);
-    mxx::comm comm;
-
-    // print out node and rank distribution
-    mxx::print_node_distribution(comm);
-
-    // create shared-mem MPI+MPI hybrid communicator
-    mxx::hybrid_comm hc(comm);
-
-    // assert same number processors per node
-    int proc_per_node = hc.local.size();
-    if (!mxx::all_same(proc_per_node, comm)) {
-        std::cerr << "Error: this benchmark assumes the same number of processors per node" << std::endl;
-        MPI_Abort(comm, -1);
-    }
-
-    // assert we have an even number of nodes
-    int num_nodes = hc.num_nodes();
-    if (num_nodes % 2 != 0) {
-        std::cerr << "Error: this benchmark assumes an even number of nodes" << std::endl;
-        MPI_Abort(comm, -1);
-    }
-
-    /*
-    // parse input arguments
-    exec_name = argv[0];
-    if (argc < 3) {
-        print_usage();
-        MPI_Abort(comm, -1);
-    }
-    int n_vote_off = atoi(argv[1]);
-    std::string output_nodefile(argv[2]);
-    if (n_vote_off < 0) {
-        print_usage();
-        MPI_Abort(comm, -1);
-    }
-    */
-
-
-    std::vector<double> bw_row = mxx::pairwise_bw_matrix(hc);
-
-    mxx::print_bw_matrix_stats(hc, bw_row);
-
-    mxx::bw_all2all(hc.global, hc.local);
-
-    return 0;
-}
diff --git a/src/benchmark_p2p_bw.cpp b/src/benchmark_p2p_bw.cpp
new file mode 100644
index 0000000..6339ed5
--- /dev/null
+++ b/src/benchmark_p2p_bw.cpp
@@ -0,0 +1,82 @@
+#include <string>
+#include <sstream>
+#include <fstream>
+
+#include <mxx/env.hpp>
+#include <mxx/comm.hpp>
+#include <mxx/benchmark.hpp>
+#include <mxx/utils.hpp>
+
+#include <ext/cxx-prettyprint/prettyprint.hpp>
+
+std::string exec_name;
+
+// Writes the command-line synopsis to stderr; both the -m option and the output file are optional.
+void print_usage() {
+    std::cerr << "Usage: " << exec_name << " [-m <msg_size>] [<output-file>]" << std::endl << "where" << std::endl;
+    std::cerr << "    -m <msg_size>     (optional) Message size for each process pair in KiB (default: 131072, i.e. 128 MiB)." << std::endl;
+    std::cerr << "    <output-file>     (optional) Filename for the pairwise bandwidth matrix (default: 'p2p_bw.csv')." << std::endl;
+}
+
+// Pairwise bandwidth benchmark driver: checks the node layout, parses the arguments, runs the benchmark and saves the matrix.
+int main(int argc, char* argv[]) {
+    mxx::env e(argc, argv);
+    mxx::comm comm;
+
+    // print out node and rank distribution
+    mxx::print_node_distribution(comm);
+
+    // create shared-mem MPI+MPI hybrid communicator
+    mxx::hybrid_comm hc(comm);
+
+    // assert same number processors per node
+    int proc_per_node = hc.local.size();
+    if (!mxx::all_same(proc_per_node, comm)) {
+        std::cerr << "Error: this benchmark assumes the same number of processors per node" << std::endl;
+        MPI_Abort(comm, -1);
+    }
+
+    // assert we have an even number of nodes (a single node is accepted)
+    int num_nodes = hc.num_nodes();
+    if (num_nodes > 1 && num_nodes % 2 != 0) {
+        std::cerr << "Error: this benchmark assumes an even number of nodes" << std::endl;
+        MPI_Abort(comm, -1);
+    }
+
+    // default args
+    size_t msg_size_kB = 128*1024; // 128 MiB per process default
+    std::string filename = "p2p_bw.csv";
+
+    // parse input arguments: [-m <msg_size>] [<output-file>]
+    exec_name = argv[0];
+    argv++; argc--;
+    if (argc > 0 && std::string(argv[0]) == "-m") { // "-m" needs a decimal value in [1, 4*1024*1024]
+        char* end = nullptr;
+        const long kB = (argc >= 2) ? strtol(argv[1], &end, 10) : 0;
+        if (argc < 2 || *end != '\0' || kB <= 0 || kB > 4*1024*1024) {
+            print_usage();
+            MPI_Abort(comm, -1);
+        }
+        msg_size_kB = static_cast<size_t>(kB);
+        argv+=2; argc-=2;
+    }
+    if (argc > 0) {
+        filename = argv[0];
+        argv++; argc--;
+    }
+    if (argc > 0) {
+        print_usage();
+        MPI_Abort(comm, -1);
+    }
+
+    MXX_ASSERT(mxx::all_same(msg_size_kB, comm));
+
+    // perform pairwise bandwidth benchmarking
+    std::vector<double> bw_row = mxx::pairwise_bw_matrix(hc, msg_size_kB*1024);
+
+    // print out benchmarking results and save as file
+    mxx::print_bw_matrix_stats(hc, bw_row);
+    mxx::save_matrix_pernode(hc, filename, bw_row);
+
+    return 0;
+}
diff --git a/src/benchmark_sort.cpp b/src/benchmark_sort.cpp
new file mode 100644
index 0000000..13dccd3
--- /dev/null
+++ b/src/benchmark_sort.cpp
@@ -0,0 +1,54 @@
+#include <string>
+#include <sstream>
+#include <fstream>
+
+#include <mxx/env.hpp>
+#include <mxx/comm.hpp>
+#include <mxx/benchmark.hpp>
+#include <mxx/utils.hpp>
+
+#include <ext/cxx-prettyprint/prettyprint.hpp>
+
+std::string exec_name;
+
+// Prints the usage message to stderr. The benchmark currently reads no command-line
+// arguments of its own; global rank 0 writes its CSV output to 'bm_samplesort.csv'.
+void print_usage() {
+    std::cerr << "Usage: " << exec_name << std::endl;
+    std::cerr << "    This benchmark takes no arguments." << std::endl;
+    std::cerr << "    Results are written to 'bm_samplesort.csv'." << std::endl;
+}
+
+// Sorting benchmark driver (work in progress): prepares the CSV output and generates random input of doubling sizes.
+int main(int argc, char* argv[]) {
+    mxx::env e(argc, argv);
+    mxx::comm comm;
+
+    // report how the ranks are spread over the nodes
+    mxx::print_node_distribution(comm);
+
+    // shared-memory MPI+MPI hybrid communicator built from `comm`
+    mxx::hybrid_comm hc(comm);
+
+    // only global rank 0 opens the results file and writes the CSV header
+    std::ofstream of;
+    if (hc.global.rank() == 0) {
+        of.open("bm_samplesort.csv");
+        of << "p,nnodes,q,m,n,min,avg,max" << std::endl;
+    }
+
+    using T = std::tuple<size_t, size_t>;
+    const size_t mempernode = 16ull << 30;
+    const size_t max_elems = mempernode/sizeof(T)/2;
+
+    // element count doubles each round, from 1024 up to max_elems
+    for (size_t elems = 1024; elems <= max_elems; elems *= 2) {
+        // fresh buffer per round, filled from a rank-dependent seed
+        std::vector<T> input(elems);
+        srand(comm.rank()* 13 + 5);
+        std::generate(input.begin(), input.end(), [](){ return std::make_pair<size_t, size_t>(rand(), rand()); });
+    }
+
+    // TODO: sorting benchmark
+    return 0;
+}
diff --git a/src/pbs_run.sh b/src/pbs_run.sh
new file mode 100644
index 0000000..1551298
--- /dev/null
+++ b/src/pbs_run.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+#PBS -q swarm
+#PBS -l nodes=16:ppn=28
+#PBS -l walltime=2:00:00
+# Runs the p2p bandwidth and all2all benchmarks with NUM_NODES * PPN MPI ranks.
+# set up env
+module load gcc/4.9.0
+module load mvapich2/2.2
+#module load openmpi
+
+
+# Change to directory from which qsub command was issued; stop if that fails
+cd "$PBS_O_WORKDIR" || exit 1
+
+# Node count and processes per node, taken from the PBS_* environment variables
+PPN=$PBS_NUM_PPN
+NUM_NODES=$PBS_NUM_NODES
+NP=$((NUM_NODES * PPN))
+
+echo "Running with np = $NP, ppn = $PPN, nnodes = $NUM_NODES"
+
+mpirun -np "$NP" -ppn "$PPN" ./bin/mxx-benchmark-p2p-bw "p2p_bw_${NUM_NODES}nodes.csv"
+mpirun -np "$NP" -ppn "$PPN" ./bin/mxx-benchmark-a2a "bm_all2all_${NUM_NODES}nodes.csv"
diff --git a/src/vote_off.cpp b/src/vote_off.cpp
index 6d541dd..38afee9 100644
--- a/src/vote_off.cpp
+++ b/src/vote_off.cpp
@@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {
 
     bool benchmark_char_align = false;
 
-    std::vector<double> bw_row = mxx::pairwise_bw_matrix(hc);
+    std::vector<double> bw_row = mxx::pairwise_bw_matrix(hc, 32*1024*1024);
     mxx::print_bw_matrix_stats(hc, bw_row);
     bool part = mxx::vote_off(hc, n_vote_off, bw_row);
     if (hc.global.rank() == 0)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c15b232..b741aa4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -26,9 +26,6 @@ target_link_libraries(mxx-test-sort mxx-gtest-main)
 add_executable(mxx-test-distribution test_distribution.cpp)
 target_link_libraries(mxx-test-distribution mxx-gtest-main)
 
-add_executable(mxx-benchmarks benchmarks.cpp)
-target_link_libraries(mxx-benchmarks mxx-gtest-main)
-
 # Combination of all parallel tests:
 add_executable(mxx-test-all test_collective.cpp test_reductions.cpp test_send.cpp test_sort.cpp test_distribution.cpp)
 target_link_libraries(mxx-test-all mxx-gtest-main)
diff --git a/test/benchmarks.cpp b/test/benchmarks.cpp
deleted file mode 100644
index e71428e..0000000
--- a/test/benchmarks.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright 2015 Georgia Institute of Technology
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <gtest/gtest.h>
-#include <mxx/comm.hpp>
-#include <mxx/benchmark.hpp>
-
-TEST(mxx, benchmark_p2p) {
-    mxx::benchmark_nodes_bw_p2p();
-}

From 5f5d152f3a9f2f9c3fbe32b1fc2ace9ecd3e2093 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Fri, 23 Mar 2018 11:56:35 -0600
Subject: [PATCH 3/4] modernize cmake files

---
 CMakeLists.txt           | 67 +++++++++++++++++++++++++++++++---------
 cmake/mxxConfig.cmake    |  3 ++
 gtest/CMakeLists.txt     |  7 ++---
 src/CMakeLists.txt       | 10 +++---
 src/benchmark_a2a.cpp    |  2 +-
 src/benchmark_p2p_bw.cpp |  2 +-
 src/benchmark_sort.cpp   |  2 +-
 test/CMakeLists.txt      |  2 +-
 8 files changed, 68 insertions(+), 27 deletions(-)
 create mode 100644 cmake/mxxConfig.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb3459c..222bdb1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,14 +1,7 @@
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 3.8) # the cxx_std_11 compile feature used below needs CMake >= 3.8
 
 # project settings
-project(mxx)
-
-##### General Compilation Settings
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wuninitialized --std=c++11")
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0")
-set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -march=native -funroll-loops")
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE} -g")
+project(mxx LANGUAGES CXX)
 
 # Add these standard paths to the search paths for FIND_LIBRARY
 # to find libraries from these locations first
@@ -25,6 +18,10 @@ if(COMMAND cmake_policy)
     cmake_policy(SET CMP0003 NEW)
 endif()
 
+add_library(mxx INTERFACE)
+target_compile_features(mxx INTERFACE cxx_std_11)
+target_include_directories(mxx INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>)
+
 
 #### MPI
 find_package(MPI REQUIRED)
@@ -32,11 +29,57 @@ if (MPI_FOUND)
     #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_COMPILE_FLAGS}")
     #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MPI_COMPILE_FLAGS}")
     #set(CMAKE_LINK_FLAGS "${CMAKE_LINK_FLAGS} ${MPI_LINK_FLAGS}")
-    include_directories(SYSTEM ${MPI_INCLUDE_PATH})
+    target_link_libraries(mxx INTERFACE ${MPI_CXX_LIBRARIES})
+    target_include_directories(mxx INTERFACE ${MPI_CXX_INCLUDE_DIRS})
 else (MPI_FOUND)
     message(SEND_ERROR "This application cannot compile without MPI")
 endif (MPI_FOUND)
 
+#### cxx-prettyprint (bundled under ext/; installed to include/mxx/ext so the INSTALL_INTERFACE path exists)
+target_include_directories(mxx INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/ext> $<INSTALL_INTERFACE:include/mxx/ext>)
+
+#### Installation
+install(DIRECTORY include/mxx/ DESTINATION include/mxx)
+install(DIRECTORY ext/ DESTINATION include/mxx/ext FILES_MATCHING PATTERN "*.hpp")
+
+include(CMakePackageConfigHelpers) # provides configure_package_config_file()
+# Add the mxx target to the 'mxxTargets' export set.
+install(TARGETS mxx
+        EXPORT mxxTargets
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib
+        RUNTIME DESTINATION bin
+        INCLUDES DESTINATION include
+        )
+# Write the export set to the build tree under the mxx:: namespace and generate the package config file next to it.
+export(EXPORT mxxTargets
+  FILE "${CMAKE_CURRENT_BINARY_DIR}/mxx/mxxTargets.cmake"
+  NAMESPACE mxx::
+)
+set(ConfigPackageLocation lib/cmake/mxx)
+configure_package_config_file(cmake/mxxConfig.cmake
+  "${CMAKE_CURRENT_BINARY_DIR}/mxx/mxxConfig.cmake"
+  INSTALL_DESTINATION "${ConfigPackageLocation}"
+)
+# Install the export set and the package config. NOTE(review): the source cmake/mxxConfig.cmake is installed, not the configured copy generated above -- confirm this is intended.
+install(EXPORT mxxTargets
+  FILE
+    mxxTargets.cmake
+  NAMESPACE
+    mxx::
+  DESTINATION
+    ${ConfigPackageLocation}
+)
+install(
+  FILES
+    cmake/mxxConfig.cmake
+  DESTINATION
+    ${ConfigPackageLocation}
+  COMPONENT
+    Devel
+)
+
+
 #### Doxygen
 find_package(Doxygen)
 if(DOXYGEN_FOUND)
@@ -82,10 +125,6 @@ endif(FAKE_BIG_MPI)
 # Save libs and executables in the same place
 set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin CACHE PATH "Output directory for applications" )
 
-include_directories("${PROJECT_SOURCE_DIR}/include")
-include_directories("${PROJECT_SOURCE_DIR}/ext")
-include_directories("${PROJECT_SOURCE_DIR}")
-
 # build tests
 add_subdirectory(gtest)
 add_subdirectory(test)
diff --git a/cmake/mxxConfig.cmake b/cmake/mxxConfig.cmake
new file mode 100644
index 0000000..f4b02c8
--- /dev/null
+++ b/cmake/mxxConfig.cmake
@@ -0,0 +1,3 @@
+find_package(MPI REQUIRED QUIET)
+
+include("${CMAKE_CURRENT_LIST_DIR}/mxxTargets.cmake")
diff --git a/gtest/CMakeLists.txt b/gtest/CMakeLists.txt
index de6b34e..48bbe34 100644
--- a/gtest/CMakeLists.txt
+++ b/gtest/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 3.6)
 
 # project settings
 project(mxx-gtest)
@@ -8,7 +8,6 @@ project(mxx-gtest)
 #  MPI Google Test:  #
 ######################
 
-include_directories("${PROJECT_SOURCE_DIR}/../")
-
 add_library(mxx-gtest-main mxx_gtest_main.cpp gtest-all.cc)
-target_link_libraries(mxx-gtest-main ${MPI_LIBRARIES} pthread)
+target_link_libraries(mxx-gtest-main PUBLIC mxx ${MPI_LIBRARIES} pthread)
+target_include_directories(mxx-gtest-main PUBLIC "../")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 123d76d..b91c260 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,20 +1,20 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.6)
 
 # project settings
 project(mxx-bm)
 
 add_executable(mxx-bm-vote-off vote_off.cpp)
-target_link_libraries(mxx-bm-vote-off ${MPI_LIBRARIES})
+target_link_libraries(mxx-bm-vote-off mxx ${MPI_LIBRARIES})
 
 # benchmark p2p bandwidth
 add_executable(mxx-benchmark-p2p-bw benchmark_p2p_bw.cpp)
-target_link_libraries(mxx-benchmark-p2p-bw ${MPI_LIBRARIES})
+target_link_libraries(mxx-benchmark-p2p-bw mxx ${MPI_LIBRARIES})
 
 # benchmark all2all
 add_executable(mxx-benchmark-a2a benchmark_a2a.cpp)
-target_link_libraries(mxx-benchmark-a2a ${MPI_LIBRARIES})
+target_link_libraries(mxx-benchmark-a2a mxx ${MPI_LIBRARIES})
 
 # benchmark parallel sorting
 add_executable(mxx-sort-benchmark benchmark_sort.cpp)
 target_compile_definitions(mxx-sort-benchmark PUBLIC -DMXX_SAMPLESORT_TIMER=1)
-target_link_libraries(mxx-sort-benchmark ${MPI_LIBRARIES})
+target_link_libraries(mxx-sort-benchmark mxx ${MPI_LIBRARIES})
diff --git a/src/benchmark_a2a.cpp b/src/benchmark_a2a.cpp
index b9342f9..d9f24f0 100644
--- a/src/benchmark_a2a.cpp
+++ b/src/benchmark_a2a.cpp
@@ -7,7 +7,7 @@
 #include <mxx/benchmark.hpp>
 #include <mxx/utils.hpp>
 
-#include <ext/cxx-prettyprint/prettyprint.hpp>
+#include <cxx-prettyprint/prettyprint.hpp>
 
 std::string exec_name;
 
diff --git a/src/benchmark_p2p_bw.cpp b/src/benchmark_p2p_bw.cpp
index 6339ed5..266e07f 100644
--- a/src/benchmark_p2p_bw.cpp
+++ b/src/benchmark_p2p_bw.cpp
@@ -7,7 +7,7 @@
 #include <mxx/benchmark.hpp>
 #include <mxx/utils.hpp>
 
-#include <ext/cxx-prettyprint/prettyprint.hpp>
+#include <cxx-prettyprint/prettyprint.hpp>
 
 std::string exec_name;
 
diff --git a/src/benchmark_sort.cpp b/src/benchmark_sort.cpp
index 13dccd3..e49ec4c 100644
--- a/src/benchmark_sort.cpp
+++ b/src/benchmark_sort.cpp
@@ -7,7 +7,7 @@
 #include <mxx/benchmark.hpp>
 #include <mxx/utils.hpp>
 
-#include <ext/cxx-prettyprint/prettyprint.hpp>
+#include <cxx-prettyprint/prettyprint.hpp>
 
 std::string exec_name;
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b741aa4..c8a1970 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 3.6)
 
 # project settings
 project(mxx-test)

From 2017ae926f8c70e5a7839f7b0b31a40979ff127f Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Sat, 24 Mar 2018 20:21:43 -0600
Subject: [PATCH 4/4] try and fix travis mpi issue

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 222bdb1..2c5982b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,7 @@ if (MPI_FOUND)
     #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MPI_COMPILE_FLAGS}")
     #set(CMAKE_LINK_FLAGS "${CMAKE_LINK_FLAGS} ${MPI_LINK_FLAGS}")
     target_link_libraries(mxx INTERFACE ${MPI_CXX_LIBRARIES})
-    target_include_directories(mxx INTERFACE ${MPI_CXX_INCLUDE_DIRS})
+    target_include_directories(mxx INTERFACE ${MPI_CXX_INCLUDE_PATH})
 else (MPI_FOUND)
     message(SEND_ERROR "This application cannot compile without MPI")
 endif (MPI_FOUND)