diff --git a/include/parallel/communicator.h b/include/parallel/communicator.h
index 99003ba9363..f3f4b27c527 100644
--- a/include/parallel/communicator.h
+++ b/include/parallel/communicator.h
@@ -143,13 +143,22 @@ class Communicator
   const communicator & get() const { return _communicator; }
 
   /**
-   * Get a tag that is unique to this Communicator.
+   * Get a tag that is unique to this Communicator.  A requested tag
+   * value may be provided.  If no request is made then an automatic
+   * unique tag value will be generated; such usage of
+   * get_unique_tag() must be done on every processor in a consistent
+   * order.
    *
    * \note If people are also using magic numbers or copying
-   * communicators around then we can't guarantee the tag is unique to
-   * this MPI_Comm.
+   * raw communicators around then we can't guarantee the tag is
+   * unique to this MPI_Comm.
+   *
+   * \note Leaving \p tagvalue unspecified is recommended in most
+   * cases.  Manually selecting tag values is dangerous, as tag values may be
+   * freed and reselected earlier than expected in asynchronous
+   * communication algorithms.
    */
-  MessageTag get_unique_tag(int tagvalue) const;
+  MessageTag get_unique_tag(int tagvalue = MessageTag::invalid_tag) const;
 
   /**
    * Reference an already-acquired tag, so that we know it will
@@ -191,10 +200,15 @@ class Communicator
   processor_id_type _rank, _size;
   SendMode _send_mode;
 
-  // mutable used_tag_values - not thread-safe, but then Parallel::
-  // isn't thread-safe in general.
+  // mutable used_tag_values and tag_queue - not thread-safe, but then
+  // Parallel:: isn't thread-safe in general.
   mutable std::map<int, unsigned int> used_tag_values;
-  bool          _I_duped_it;
+  mutable int _next_tag;
+
+  int _max_tag;
+
+  // Keep track of duplicate/split operations so we know when to free
+  bool _I_duped_it;
 
   // Communication operations:
 public:
@@ -214,6 +228,11 @@ class Communicator
    */
   void barrier () const;
 
+  /**
+   * Start a barrier that doesn't block
+   */
+  void nonblocking_barrier (Request & req) const;
+
   /**
    * Verify that a local variable has the same value on all processors.
    * Containers must have the same value in every entry.
@@ -413,6 +432,36 @@ class Communicator
                 Request & req,
                 const MessageTag & tag=any_tag) const;
 
+  /**
+   * Nonblocking-receive from one processor with user-defined type.
+   *
+   * Checks to see if a message can be received from the
+   * src_processor_id .  If so, it starts a non-blocking
+   * receive using the passed in request and returns true
+   *
+   * Otherwise - if there is no message to receive it returns false
+   *
+   * Note: The buf does NOT need to be properly sized before this call;
+   * this will resize the buffer automatically.
+   *
+   * If \p T is a container, container-of-containers, etc., then
+   * \p type should be the DataType of the underlying fixed-size
+   * entries in the container(s).
+   *
+   * @param src_processor_id The pid to receive from or "any".
+   * Will be set to the actual src being received from
+   * @param buf The buffer to receive into
+   * @param type The intrinsic datatype to receive
+   * @param req The request to use
+   * @param tag The tag to use
+   */
+  template <typename T, typename A>
+  bool possibly_receive (unsigned int & src_processor_id,
+                         std::vector<T,A> & buf,
+                         const DataType & type,
+                         Request & req,
+                         const MessageTag & tag) const;
+
   /**
    * Blocking-send range-of-pointers to one processor.  This
    * function does not send the raw pointers, but rather constructs
diff --git a/include/parallel/parallel_implementation.h b/include/parallel/parallel_implementation.h
index fd148ed2140..b6cca898de7 100644
--- a/include/parallel/parallel_implementation.h
+++ b/include/parallel/parallel_implementation.h
@@ -178,6 +178,9 @@ inline status Communicator::probe (const unsigned int src_processor_id,
 
   status stat;
 
+  libmesh_assert(src_processor_id < this->size() ||
+                 src_processor_id == any_source);
+
   libmesh_call_mpi
     (MPI_Probe (src_processor_id, tag.value(), this->get(), &stat));
 
@@ -197,6 +200,9 @@ inline Status Communicator::packed_range_probe (const unsigned int src_processor
 
   int int_flag;
 
+  libmesh_assert(src_processor_id < this->size() ||
+                 src_processor_id == any_source);
+
   libmesh_call_mpi(MPI_Iprobe(src_processor_id,
                               tag.value(),
                               this->get(),
@@ -218,6 +224,8 @@ inline void Communicator::send (const unsigned int dest_processor_id,
 
   T * dataptr = buf.empty() ? nullptr : const_cast<T *>(buf.data());
 
+  libmesh_assert_less(dest_processor_id, this->size());
+
   libmesh_call_mpi
     (((this->send_mode() == SYNCHRONOUS) ?
       MPI_Ssend : MPI_Send) (dataptr,
@@ -240,6 +248,8 @@ inline void Communicator::send (const unsigned int dest_processor_id,
 
   T * dataptr = buf.empty() ? nullptr : const_cast<T *>(buf.data());
 
+  libmesh_assert_less(dest_processor_id, this->size());
+
   libmesh_call_mpi
     (((this->send_mode() == SYNCHRONOUS) ?
       MPI_Issend : MPI_Isend) (dataptr,
@@ -266,6 +276,8 @@ inline void Communicator::send (const unsigned int dest_processor_id,
 
   T * dataptr = const_cast<T*> (&buf);
 
+  libmesh_assert_less(dest_processor_id, this->size());
+
   libmesh_call_mpi
     (((this->send_mode() == SYNCHRONOUS) ?
       MPI_Ssend : MPI_Send) (dataptr,
@@ -288,6 +300,8 @@ inline void Communicator::send (const unsigned int dest_processor_id,
 
   T * dataptr = const_cast<T*>(&buf);
 
+  libmesh_assert_less(dest_processor_id, this->size());
+
   libmesh_call_mpi
     (((this->send_mode() == SYNCHRONOUS) ?
       MPI_Issend : MPI_Isend) (dataptr,
@@ -417,6 +431,8 @@ inline void Communicator::send (const unsigned int dest_processor_id,
 {
   LOG_SCOPE("send()", "Parallel");
 
+  libmesh_assert_less(dest_processor_id, this->size());
+
   libmesh_call_mpi
     (((this->send_mode() == SYNCHRONOUS) ?
       MPI_Issend : MPI_Isend) (buf.empty() ? nullptr : const_cast<T*>(buf.data()),
@@ -778,6 +794,9 @@ inline Status Communicator::receive (const unsigned int src_processor_id,
   // datatype so we can later query the size
   Status stat(this->probe(src_processor_id, tag), StandardType<T>(&buf));
 
+  libmesh_assert(src_processor_id < this->size() ||
+                 src_processor_id == any_source);
+
   libmesh_call_mpi
     (MPI_Recv (&buf, 1, StandardType<T>(&buf), src_processor_id,
                tag.value(), this->get(), stat.get()));
@@ -795,6 +814,9 @@ inline void Communicator::receive (const unsigned int src_processor_id,
 {
   LOG_SCOPE("receive()", "Parallel");
 
+  libmesh_assert(src_processor_id < this->size() ||
+                 src_processor_id == any_source);
+
   libmesh_call_mpi
     (MPI_Irecv (&buf, 1, StandardType<T>(&buf), src_processor_id,
                 tag.value(), this->get(), req.get()));
@@ -929,6 +951,9 @@ inline Status Communicator::receive (const unsigned int src_processor_id,
 
   buf.resize(stat.size());
 
+  libmesh_assert(src_processor_id < this->size() ||
+                 src_processor_id == any_source);
+
   // Use stat.source() and stat.tag() in the receive - if
   // src_processor_id is or tag is "any" then we want to be sure we
   // try to receive the same message we just probed.
@@ -953,6 +978,9 @@ inline void Communicator::receive (const unsigned int src_processor_id,
 {
   LOG_SCOPE("receive()", "Parallel");
 
+  libmesh_assert(src_processor_id < this->size() ||
+                 src_processor_id == any_source);
+
   libmesh_call_mpi
     (MPI_Irecv (buf.empty() ? nullptr : buf.data(),
                 cast_int<int>(buf.size()), type, src_processor_id,
@@ -1191,6 +1219,10 @@ inline void Communicator::send_receive(const unsigned int dest_processor_id,
       return;
     }
 
+  libmesh_assert_less(dest_processor_id, this->size());
+  libmesh_assert(source_processor_id < this->size() ||
+                 source_processor_id == any_source);
+
   // MPI_STATUS_IGNORE is from MPI-2; using it with some versions of
   // MPICH may cause a crash:
   // https://bugzilla.mcs.anl.gov/globus/show_bug.cgi?id=1798
@@ -1448,6 +1480,8 @@ inline void Communicator::broadcast (bool & data, const unsigned int root_id) co
   // MPI::BOOL available
   char char_data = data;
 
+  libmesh_assert_less(root_id, this->size());
+
   // Spread data to remote processors.
   libmesh_call_mpi
     (MPI_Bcast (&char_data, 1, StandardType<char>(&char_data),
@@ -1515,6 +1549,8 @@ inline void Communicator::broadcast (std::vector<T,A> & data,
   // Pass nullptr if our vector is empty.
   T * data_ptr = data.empty() ? nullptr : data.data();
 
+  libmesh_assert_less(root_id, this->size());
+
   libmesh_call_mpi
     (MPI_Bcast (data_ptr, cast_int<int>(data.size()),
                 StandardType<T>(data_ptr), root_id, this->get()));
@@ -2618,6 +2654,8 @@ inline void Communicator::gather(const unsigned int root_id,
 
       StandardType<T> send_type(&sendval);
 
+      libmesh_assert_less(root_id, this->size());
+
       libmesh_call_mpi
         (MPI_Gather(const_cast<T*>(&sendval), 1, send_type,
                     recv.empty() ? nullptr : recv.data(), 1, send_type,
@@ -2672,6 +2710,8 @@ inline void Communicator::gather(const unsigned int root_id,
   if (root_id == this->rank())
     r.resize(globalsize);
 
+  libmesh_assert_less(root_id, this->size());
+
   // and get the data from the remote processors
   libmesh_call_mpi
     (MPI_Gatherv (r_src.empty() ? nullptr : r_src.data(), mysize,
@@ -2723,6 +2763,8 @@ inline void Communicator::gather(const unsigned int root_id,
       if (this->rank() == root_id)
         r.resize(globalsize, 0);
 
+      libmesh_assert_less(root_id, this->size());
+
       // and get the data from the remote processors.
       libmesh_call_mpi
         (MPI_Gatherv (const_cast<T*>(sendval.data()),
@@ -2942,6 +2984,8 @@ void Communicator::scatter(const std::vector<T,A> & data,
   T * data_ptr = const_cast<T*>(data.empty() ? nullptr : data.data());
   libmesh_ignore(data_ptr); // unused ifndef LIBMESH_HAVE_MPI
 
+  libmesh_assert_less(root_id, this->size());
+
   libmesh_call_mpi
     (MPI_Scatter (data_ptr, 1, StandardType<T>(data_ptr),
                   &recv, 1, StandardType<T>(&recv), root_id, this->get()));
@@ -2980,6 +3024,8 @@ void Communicator::scatter(const std::vector<T,A> & data,
   T * recv_ptr = recv.empty() ? nullptr : recv.data();
   libmesh_ignore(data_ptr, recv_ptr); // unused ifndef LIBMESH_HAVE_MPI
 
+  libmesh_assert_less(root_id, this->size());
+
   libmesh_call_mpi
     (MPI_Scatter (data_ptr, recv_buffer_size, StandardType<T>(data_ptr),
                   recv_ptr, recv_buffer_size, StandardType<T>(recv_ptr), root_id, this->get()));
@@ -3032,6 +3078,8 @@ void Communicator::scatter(const std::vector<T,A1> & data,
   T * recv_ptr = recv.empty() ? nullptr : recv.data();
   libmesh_ignore(data_ptr, count_ptr, recv_ptr); // unused ifndef LIBMESH_HAVE_MPI
 
+  libmesh_assert_less(root_id, this->size());
+
   // Scatter the non-uniform chunks
   libmesh_call_mpi
     (MPI_Scatterv (data_ptr, count_ptr, displacements.data(), StandardType<T>(data_ptr),
@@ -3206,6 +3254,53 @@ inline void Communicator::allgather_packed_range(Context * context,
 }
 
 
+
+template <typename T, typename A>
+inline bool Communicator::possibly_receive (unsigned int & src_processor_id,
+                                            std::vector<T,A> & buf,
+                                            const DataType & type,
+                                            Request & req,
+                                            const MessageTag & tag) const
+{
+  LOG_SCOPE("possibly_receive()", "Parallel");
+
+  Status stat(type);
+  // Output flag for MPI_Iprobe; stays 0 unless the probe sets it
+  int int_flag = 0;
+  // The source must be a valid rank or the any_source wildcard
+  libmesh_assert(src_processor_id < this->size() ||
+                 src_processor_id == any_source);
+  // Non-blocking probe for a message matching this source and tag
+  libmesh_call_mpi(MPI_Iprobe(src_processor_id,
+                              tag.value(),
+                              this->get(),
+                              &int_flag,
+                              stat.get()));
+
+  if (int_flag)
+  {
+    buf.resize(stat.size());
+    // Report the probed message's source back to the caller
+    src_processor_id = stat.source();
+    // Start the non-blocking receive into the buffer we just resized
+    libmesh_call_mpi
+      (MPI_Irecv (buf.data(),
+                  cast_int<int>(buf.size()),
+                  type,
+                  src_processor_id,
+                  tag.value(),
+                  this->get(),
+                  req.get()));
+
+    // The MessageTag should stay registered for the Request lifetime
+    req.add_post_wait_work
+      (new Parallel::PostWaitDereferenceTag(tag));
+  }
+  // Converts to true only if the probe flag was set and a receive was posted
+  return int_flag;
+}
+
+
 } // namespace Parallel
 
 } // namespace libMesh
diff --git a/include/parallel/parallel_sync.h b/include/parallel/parallel_sync.h
index 451aae88205..8cb0d50366d 100644
--- a/include/parallel/parallel_sync.h
+++ b/include/parallel/parallel_sync.h
@@ -27,6 +27,7 @@
 #include <map>
 #include <type_traits>
 #include <vector>
+#include <list>
 
 
 namespace libMesh
@@ -53,42 +54,14 @@ namespace Parallel {
  * All receives and actions are completed before this function
  * returns.
  *
- * Not all sends may have yet completed.  The supplied container of
- * Request objects, \p req, has more requests inserted, one for each
- * of the data sends.  These requests must be waited on before the \p
- * data map is deleted.
+ * Note: it is very important that the message tag be completely
+ * unique to each invocation
  */
 template <typename MapToVectors,
-          typename RequestContainer,
           typename ActionFunctor>
 void push_parallel_vector_data(const Communicator & comm,
                                const MapToVectors & data,
-                               RequestContainer & reqs,
-                               ActionFunctor & act_on_data);
-
-
-
-/**
- * Send and receive and act on vectors of data.
- *
- * The \p data map is indexed by processor ids as keys, and for each
- * processor id in the map there should be a vector of data to send.
- *
- * Data which is received from other processors will be operated on by
- * act_on_data(processor_id_type pid, const std::vector<datum> & data);
- *
- * No guarantee about operation ordering is made - this function will
- * attempt to act on data in the order in which it is received.
- *
- * All communication and actions are complete when this function
- * returns.
- */
-template <typename MapToVectors,
-          typename ActionFunctor>
-void push_parallel_vector_data(const Communicator & comm,
-                               const MapToVectors & data,
-                               ActionFunctor & act_on_data);
-
+                               const ActionFunctor & act_on_data);
 
 /**
  * Send query vectors, receive and answer them with vectors of data,
@@ -116,50 +89,6 @@ void push_parallel_vector_data(const Communicator & comm,
  *
  * All receives and actions are completed before this function
  * returns.
- *
- * Not all sends may have yet completed.  The supplied container of
- * Request objects, \p req, has more requests inserted, one for each
- * of the data sends.  These requests must be waited on before the \p
- * data map is deleted.
- */
-template <typename datum,
-          typename MapToVectors,
-          typename RequestContainer,
-          typename GatherFunctor,
-          typename ActionFunctor>
-void pull_parallel_vector_data(const Communicator & comm,
-                               const MapToVectors & queries,
-                               RequestContainer & reqs,
-                               GatherFunctor & gather_data,
-                               ActionFunctor & act_on_data,
-                               const datum * example);
-
-/**
- * Send query vectors, receive and answer them with vectors of data,
- * then act on those answers.
- *
- * The \p data map is indexed by processor ids as keys, and for each
- * processor id in the map there should be a vector of query ids to send.
- *
- * Query data which is received from other processors will be operated
- * on by
- * gather_data(processor_id_type pid, const std::vector<id> & ids,
- *             std::vector<datum> & data)
- *
- * Answer data which is received from other processors will be operated on by
- * act_on_data(processor_id_type pid, const std::vector<id> & ids,
- *             const std::vector<datum> & data);
- *
- * The example pointer may be null; it merely needs to be of the
- * correct type.  It's just here because function overloading in C++
- * is easy, whereas SFINAE is hard and partial template specialization
- * of functions is impossible.
- *
- * No guarantee about operation ordering is made - this function will
- * attempt to act on data in the order in which it is received.
- *
- * All communication and actions are complete when this function
- * returns.
  */
 template <typename datum,
           typename MapToVectors,
@@ -175,24 +104,6 @@ void pull_parallel_vector_data(const Communicator & comm,
 // Parallel function overloads
 //
 
-/*
- * A specialization for types that are harder to non-blocking receive.
- */
-template <template <typename, typename, typename ...> class MapType,
-          typename KeyType,
-          typename ValueType,
-          typename A1,
-          typename A2,
-          typename ... ExtraTypes,
-          typename RequestContainer,
-          typename ActionFunctor>
-void push_parallel_vector_data(const Communicator & comm,
-                               const MapType<processor_id_type, std::vector<std::vector<ValueType,A1>,A2>, ExtraTypes...> & data,
-                               RequestContainer & reqs,
-                               ActionFunctor & act_on_data);
-
-
-
 /*
  * A specialization for types that are harder to non-blocking receive.
  */
@@ -205,7 +116,7 @@ template <template <typename, typename, typename ...> class MapType,
           typename ActionFunctor>
 void push_parallel_vector_data(const Communicator & comm,
                                const MapType<processor_id_type, std::vector<std::vector<ValueType,A1>,A2>, ExtraTypes...> & data,
-                               ActionFunctor & act_on_data);
+                               const ActionFunctor & act_on_data);
 
 /*
  * A specialization for types that are harder to non-blocking receive.
@@ -213,12 +124,10 @@ void push_parallel_vector_data(const Communicator & comm,
 template <typename datum,
           typename A,
           typename MapToVectors,
-          typename RequestContainer,
           typename GatherFunctor,
           typename ActionFunctor>
 void pull_parallel_vector_data(const Communicator & comm,
                                const MapToVectors & queries,
-                               RequestContainer & reqs,
                                GatherFunctor & gather_data,
                                ActionFunctor & act_on_data,
                                const std::vector<datum,A> * example);
@@ -228,45 +137,21 @@ void pull_parallel_vector_data(const Communicator & comm,
 
 
 
-
 //------------------------------------------------------------------------
 // Parallel members
 //
 
 template <typename MapToVectors,
-          typename RequestContainer,
           typename ActionFunctor>
 void push_parallel_vector_data(const Communicator & comm,
                                const MapToVectors & data,
-                               RequestContainer & reqs,
-                               ActionFunctor & act_on_data)
+                               const ActionFunctor & act_on_data)
 {
   // This function must be run on all processors at once
   libmesh_parallel_only(comm);
 
-  processor_id_type num_procs = comm.size();
-
-  // Size of vectors to send to each procesor
-  std::vector<std::size_t> will_send_to(num_procs, 0);
-  processor_id_type num_sends = 0;
-  for (auto & datapair : data)
-    {
-      // Don't try to send anywhere that doesn't exist
-      libmesh_assert_less(datapair.first, num_procs);
-
-      // Don't give us empty vectors to send
-      libmesh_assert_greater(datapair.second.size(), 0);
-
-      will_send_to[datapair.first] = datapair.second.size();
-      num_sends++;
-    }
-
-  // Tell everyone about where everyone will send to
-  comm.alltoall(will_send_to);
-
-  // will_send_to now represents who we'll receive from
-  // give it a good name
-  auto & will_receive_from = will_send_to;
+  // This function implements the "NBX" algorithm from
+  // https://htor.inf.ethz.ch/publications/img/hoefler-dsde-protocols.pdf
 
   // This function only works for "flat" data that we can pre-size
   // receive buffers for: a map to vectors-of-standard-types, not e.g.
@@ -284,19 +169,34 @@ void push_parallel_vector_data(const Communicator & comm,
   // complete normally." so we're cool.
   typedef decltype(data.begin()->second.front()) ref_type;
   typedef typename std::remove_reference<ref_type>::type nonref_type;
-  StandardType<typename std::remove_const<nonref_type>::type> datatype;
+  typedef typename std::remove_const<nonref_type>::type nonconst_nonref_type;
+  StandardType<nonconst_nonref_type> datatype;
 
   // We'll grab a tag so we can overlap request sends and receives
   // without confusing one for the other
-  MessageTag tag = comm.get_unique_tag(1225);
+  auto tag = comm.get_unique_tag();
 
   MapToVectors received_data;
 
-  // Post all of the sends, non-blocking
+  // Post all of the sends, non-blocking and synchronous
+
+  // Save off the old send_mode so we can restore it after this
+  auto old_send_mode = comm.send_mode();
+
+  // Set the sending to synchronous - this is so that we can know when
+  // the sends are complete
+  const_cast<Communicator &>(comm).send_mode(Communicator::SYNCHRONOUS);
+
+  // The send requests
+  std::list<Request> reqs;
+
+  processor_id_type num_procs = comm.size();
+
   for (auto & datapair : data)
     {
-      processor_id_type destid = datapair.first;
-      libmesh_assert_less(destid, num_procs);
+      // In the case of data partitioned into more processors than we
+      // have ranks, we "wrap around"
+      processor_id_type destid = datapair.first % num_procs;
       auto & datum = datapair.second;
 
       // Just act on data if the user requested a send-to-self
@@ -305,95 +205,164 @@ void push_parallel_vector_data(const Communicator & comm,
       else
         {
           Request sendreq;
-          comm.send(destid, datum, datatype, sendreq, tag);
-          reqs.insert(reqs.end(), sendreq);
+          comm.send(destid, datum,/* datatype,*/ sendreq, tag);
+          reqs.push_back(sendreq);
         }
     }
 
-  // Post all of the receives, non-blocking
-  std::vector<Request> receive_reqs;
-  std::vector<processor_id_type> receive_procids;
-  for (processor_id_type proc_id = 0; proc_id < num_procs; proc_id++)
-    if (will_receive_from[proc_id] && proc_id != comm.rank())
-      {
-        Request req;
-        auto & incoming_data = received_data[proc_id];
-        incoming_data.resize(will_receive_from[proc_id]);
-        comm.receive(proc_id, incoming_data, datatype, req, tag);
-        receive_reqs.push_back(req);
-        receive_procids.push_back(proc_id);
-      }
-
-  while(receive_reqs.size())
+  bool sends_complete = reqs.empty();
+  bool started_barrier = false;
+  Request barrier_request;
+
+  // Receive
+
+  // The pair of src_pid and requests
+  std::list<std::pair<unsigned int, std::shared_ptr<Request>>> receive_reqs;
+  auto current_request = std::make_shared<Request>();
+
+  std::multimap<processor_id_type, std::shared_ptr<std::vector<nonconst_nonref_type>>> incoming_data;
+  auto current_incoming_data = std::make_shared<std::vector<nonconst_nonref_type>>();
+
+  unsigned int current_src_proc = 0;
+
+  // Keep looking for receives
+  while (true)
+  {
+    // Look for data from anywhere
+    current_src_proc = Parallel::any_source;
+
+    // Check if there is a message and start receiving it
+    if (comm.possibly_receive(current_src_proc, *current_incoming_data, datatype, *current_request, tag))
     {
-      std::size_t completed = waitany(receive_reqs);
-      processor_id_type proc_id = receive_procids[completed];
-      receive_reqs.erase(receive_reqs.begin() + completed);
-      receive_procids.erase(receive_procids.begin() + completed);
+      receive_reqs.emplace_back(current_src_proc, current_request);
+      current_request = std::make_shared<Request>();
 
-      act_on_data(proc_id, received_data[proc_id]);
-      received_data.erase(proc_id);
+      // current_src_proc will now hold the src pid for this receive
+      incoming_data.emplace(current_src_proc, current_incoming_data);
+      current_incoming_data = std::make_shared<std::vector<nonconst_nonref_type>>();
     }
-}
+
+    // Clean up outstanding receive requests
+    receive_reqs.remove_if([&act_on_data, &incoming_data](std::pair<unsigned int, std::shared_ptr<Request>> & pid_req_pair)
+                           {
+                             auto & pid = pid_req_pair.first;
+                             auto & req = pid_req_pair.second;
+
+                             // If it's finished - let's act on it
+                             if (req->test())
+                             {
+                               // Do any post-wait work
+                               req->wait();
+
+                               auto it = incoming_data.find(pid);
+                               libmesh_assert(it != incoming_data.end());
+
+                               act_on_data(pid, *it->second);
+
+                               // Don't need this data anymore
+                               incoming_data.erase(it);
+
+                               // This removes it from the list
+                               return true;
+                             }
+
+                             // Not finished yet
+                             return false;
+                           });
+
+    reqs.remove_if([](Request & req)
+                   {
+                     if (req.test())
+                     {
+                       // Do Post-Wait work
+                       req.wait();
+
+                       return true;
+                     }
+
+                     // Not finished yet
+                     return false;
+                   });
 
 
+    // See if all of the sends are finished
+    if (reqs.empty())
+      sends_complete = true;
+
+    // If they've all completed then we can start the barrier
+    if (sends_complete && !started_barrier)
+    {
+      started_barrier = true;
+      comm.nonblocking_barrier(barrier_request);
+    }
+
+    // Must fully receive everything before being allowed to move on!
+    if (receive_reqs.empty())
+      // See if all processors have finished all sends (i.e. _done_!)
+      if (started_barrier)
+        if (barrier_request.test())
+          break; // Done!
+  }
+
+  // Reset the send mode
+  const_cast<Communicator &>(comm).send_mode(old_send_mode);
+}
+
 
 template <template <typename, typename, typename ...> class MapType,
           typename ValueType,
           typename A1,
           typename A2,
           typename ... ExtraTypes,
-          typename RequestContainer,
           typename ActionFunctor>
 void push_parallel_vector_data(const Communicator & comm,
                                const MapType<processor_id_type, std::vector<std::vector<ValueType,A1>,A2>, ExtraTypes...> & data,
-                               RequestContainer & reqs,
-                               ActionFunctor & act_on_data)
+                               const ActionFunctor & act_on_data)
 {
   // This function must be run on all processors at once
   libmesh_parallel_only(comm);
 
   processor_id_type num_procs = comm.size();
 
-  // Size of vectors to send to each procesor
+  // Number of vectors to send to each processor
   std::vector<std::size_t> will_send_to(num_procs, 0);
-  processor_id_type num_sends = 0;
   for (auto & datapair : data)
     {
-      // Don't try to send anywhere that doesn't exist
-      libmesh_assert_less(datapair.first, num_procs);
+      // In the case of data partitioned into more processors than we
+      // have ranks, we "wrap around"
+      processor_id_type destid = datapair.first % num_procs;
 
       // Don't give us empty vectors to send
       libmesh_assert_greater(datapair.second.size(), 0);
 
-      will_send_to[datapair.first] = datapair.second.size();
-      num_sends++;
+      will_send_to[destid]++;
     }
 
   // Tell everyone about where everyone will send to
   comm.alltoall(will_send_to);
 
-  // will_send_to now represents who we'll receive from
-  // give it a good name
+  // will_send_to now represents how many vectors we'll receive from
+  // each processor; give it a better name.
   auto & will_receive_from = will_send_to;
 
   processor_id_type n_receives = 0;
   for (processor_id_type proc_id = 0; proc_id < num_procs; proc_id++)
-    if (will_receive_from[proc_id])
-      n_receives++;
+    n_receives += will_receive_from[proc_id];
 
   // We'll construct a datatype once for repeated use
   StandardType<ValueType> datatype;
 
   // We'll grab a tag so we can overlap request sends and receives
   // without confusing one for the other
-  MessageTag tag = comm.get_unique_tag(1225);
+  MessageTag tag = comm.get_unique_tag();
+
+  // The send requests
+  std::list<Request> reqs;
 
   // Post all of the sends, non-blocking
   for (auto & datapair : data)
     {
-      processor_id_type destid = datapair.first;
-      libmesh_assert_less(destid, num_procs);
+      processor_id_type destid = datapair.first % num_procs;
       auto & datum = datapair.second;
 
       // Just act on data if the user requested a send-to-self
@@ -406,7 +375,7 @@ void push_parallel_vector_data(const Communicator & comm,
         {
           Request sendreq;
           comm.send(destid, datum, datatype, sendreq, tag);
-          reqs.insert(reqs.end(), sendreq);
+          reqs.push_back(sendreq);
         }
     }
 
@@ -426,197 +395,137 @@ void push_parallel_vector_data(const Communicator & comm,
       comm.receive(proc_id, received_data, datatype, tag);
       act_on_data(proc_id, received_data);
     }
-}
 
-
-
-template <typename MapToVectors,
-          typename ActionFunctor>
-void push_parallel_vector_data(const Communicator & comm,
-                               const MapToVectors & data,
-                               ActionFunctor & act_on_data)
-{
-  std::vector<Request> requests;
-
-  push_parallel_vector_data(comm, data, requests, act_on_data);
-
-  wait(requests);
+  // Wait on all the sends to complete
+  for (auto & req : reqs)
+    req.wait();
 }
 
 
-
-template <template <typename, typename, typename ...> class MapType,
-          typename ValueType,
-          typename A1,
-          typename A2,
-          typename ... ExtraTypes,
-          typename ActionFunctor>
-void push_parallel_vector_data(const Communicator & comm,
-                               const MapType<processor_id_type, std::vector<std::vector<ValueType,A1>,A2>, ExtraTypes...> & data,
-                               ActionFunctor & act_on_data)
-{
-  std::vector<Request> requests;
-
-  push_parallel_vector_data(comm, data, requests, act_on_data);
-
-  wait(requests);
-}
-
-
-
 template <typename datum,
           typename MapToVectors,
-          typename RequestContainer,
           typename GatherFunctor,
           typename ActionFunctor>
 void pull_parallel_vector_data(const Communicator & comm,
                                const MapToVectors & queries,
-                               RequestContainer & reqs,
                                GatherFunctor & gather_data,
                                ActionFunctor & act_on_data,
                                const datum *)
 {
   typedef typename MapToVectors::mapped_type query_type;
 
-  std::map<processor_id_type, std::vector<datum> >
+  std::multimap<processor_id_type, std::vector<datum> >
     response_data, received_data;
-  std::vector<Request> response_reqs;
 
   StandardType<datum> datatype;
 
-  // We'll grab a tag so we can overlap request sends and receives
-  // without confusing one for the other
-  MessageTag tag = comm.get_unique_tag(105);
+#ifndef NDEBUG
+  processor_id_type max_pid = 0;
+  for (auto p : queries)
+    max_pid = std::max(max_pid, p.first);
+#endif
 
   auto gather_functor =
-    [&comm, &gather_data, &response_data, &response_reqs, &datatype, &tag]
+    [&gather_data, &response_data]
     (processor_id_type pid, query_type query)
     {
-      gather_data(pid, query, response_data[pid]);
-      libmesh_assert_equal_to(query.size(), response_data[pid].size());
-
-      // Just act on data later if the user requested a send-to-self
-      if (pid != comm.rank())
-        {
-          Request sendreq;
-          comm.send(pid, response_data[pid], datatype, sendreq, tag);
-          response_reqs.push_back(sendreq);
-        }
+      auto new_data_it =
+        response_data.emplace(pid, std::vector<datum>());
+      gather_data(pid, query, new_data_it->second);
+      libmesh_assert_equal_to(query.size(), new_data_it->second.size());
     };
 
-  push_parallel_vector_data (comm, queries, reqs, gather_functor);
+  push_parallel_vector_data (comm, queries, gather_functor);
 
-  // Every outgoing query should now have an incoming response.
-  // Post all of the receives, non-blocking
-  std::vector<Request> receive_reqs;
-  std::vector<processor_id_type> receive_procids;
-  for (auto & querypair : queries)
+  std::map<processor_id_type, unsigned int> responses_acted_on;
+
+  const processor_id_type num_procs = comm.size();
+
+  auto action_functor =
+    [&act_on_data, &queries, &responses_acted_on,
+#ifndef NDEBUG
+     max_pid,
+#endif
+     num_procs
+    ]
+    (processor_id_type pid, const std::vector<datum> & data)
     {
-      processor_id_type proc_id = querypair.first;
-      libmesh_assert_less(proc_id, comm.size());
+      auto q_pid_its = queries.equal_range(pid);
+      auto query_it = q_pid_its.first;
+      libmesh_assert(query_it != q_pid_its.second);
 
-      if (proc_id == comm.rank())
-        {
-          libmesh_assert(queries.count(proc_id));
-          libmesh_assert_equal_to(queries.at(proc_id).size(),
-                                  response_data.at(proc_id).size());
-          act_on_data(proc_id, queries.at(proc_id), response_data.at(proc_id));
-        }
-      else
+      // We rely on responses coming in the same order as queries
+      const unsigned int nth_query = responses_acted_on[pid]++;
+      for (unsigned int i=0; i != nth_query; ++i)
         {
-          auto & querydata = querypair.second;
-          Request req;
-          auto & incoming_data = received_data[proc_id];
-          incoming_data.resize(querydata.size());
-          comm.receive(proc_id, incoming_data, datatype, req, tag);
-          receive_reqs.push_back(req);
-          receive_procids.push_back(proc_id);
+          query_it++;
+          if (query_it == q_pid_its.second)
+            {
+              do
+                {
+                  pid += num_procs;
+                  q_pid_its = queries.equal_range(pid);
+                  libmesh_assert_less_equal(pid, max_pid);
+                } while (q_pid_its.first == q_pid_its.second);
+              query_it = q_pid_its.first;
+            }
         }
-    }
-
-  while(receive_reqs.size())
-    {
-      std::size_t completed = waitany(receive_reqs);
-      processor_id_type proc_id = receive_procids[completed];
-      receive_reqs.erase(receive_reqs.begin() + completed);
-      receive_procids.erase(receive_procids.begin() + completed);
 
-      libmesh_assert(queries.count(proc_id));
-      libmesh_assert_equal_to(queries.at(proc_id).size(),
-                              received_data[proc_id].size());
-      act_on_data(proc_id, queries.at(proc_id), received_data[proc_id]);
-      received_data.erase(proc_id);
-    }
+      act_on_data(pid, query_it->second, data);
+    };
 
-  wait(response_reqs);
+  push_parallel_vector_data (comm, response_data, action_functor);
 }
 
 
-template <typename datum,
-          typename MapToVectors,
-          typename GatherFunctor,
-          typename ActionFunctor>
-void pull_parallel_vector_data(const Communicator & comm,
-                               const MapToVectors & queries,
-                               GatherFunctor & gather_data,
-                               ActionFunctor & act_on_data,
-                               const datum * example)
-{
-  std::vector<Request> requests;
-
-  pull_parallel_vector_data(comm, queries, requests, gather_data,
-                            act_on_data, example);
-
-  wait(requests);
-}
 
 
 template <typename datum,
           typename A,
           typename MapToVectors,
-          typename RequestContainer,
           typename GatherFunctor,
           typename ActionFunctor>
 void pull_parallel_vector_data(const Communicator & comm,
                                const MapToVectors & queries,
-                               RequestContainer & reqs,
                                GatherFunctor & gather_data,
                                ActionFunctor & act_on_data,
                                const std::vector<datum,A> *)
 {
   typedef typename MapToVectors::mapped_type query_type;
 
-  std::map<processor_id_type, std::vector<std::vector<datum,A>>>
-    response_data;
+  // First index: order of creation, irrelevant
+  std::vector<std::vector<std::vector<datum,A>>> response_data;
   std::vector<Request> response_reqs;
 
   // We'll grab a tag so we can overlap request sends and receives
   // without confusing one for the other
-  MessageTag tag = comm.get_unique_tag(105);
+  MessageTag tag = comm.get_unique_tag();
 
   auto gather_functor =
     [&comm, &gather_data, &act_on_data,
      &response_data, &response_reqs, &tag]
     (processor_id_type pid, query_type query)
     {
-      gather_data(pid, query, response_data[pid]);
+      std::vector<std::vector<datum,A>> response;
+      gather_data(pid, query, response);
       libmesh_assert_equal_to(query.size(),
-                              response_data[pid].size());
+                              response.size());
 
       // Just act on data if the user requested a send-to-self
       if (pid == comm.rank())
         {
-          act_on_data(pid, query, response_data[pid]);
+          act_on_data(pid, query, response);
         }
       else
         {
           Request sendreq;
-          comm.send(pid, response_data[pid], sendreq, tag);
+          comm.send(pid, response, sendreq, tag);
           response_reqs.push_back(sendreq);
+          response_data.push_back(std::move(response));
         }
     };
 
-  push_parallel_vector_data (comm, queries, reqs, gather_functor);
+  push_parallel_vector_data (comm, queries, gather_functor);
 
   // Every outgoing query should now have an incoming response.
   //
diff --git a/src/base/dof_map_constraints.C b/src/base/dof_map_constraints.C
index ac8d1cf39da..aab9d4141fd 100644
--- a/src/base/dof_map_constraints.C
+++ b/src/base/dof_map_constraints.C
@@ -3060,7 +3060,7 @@ void DofMap::allgather_recursive_constraints(MeshBase & mesh)
 
   // We may be receiving packed_range sends out of order with
   // parallel_sync tags, so make sure they're received correctly.
-  Parallel::MessageTag range_tag = this->comm().get_unique_tag(14142);
+  Parallel::MessageTag range_tag = this->comm().get_unique_tag();
 
   while (unexpanded_set_nonempty)
     {
@@ -3541,7 +3541,7 @@ void DofMap::scatter_constraints(MeshBase & mesh)
 
   // We may be receiving packed_range sends out of order with
   // parallel_sync tags, so make sure they're received correctly.
-  Parallel::MessageTag range_tag = this->comm().get_unique_tag(1414);
+  Parallel::MessageTag range_tag = this->comm().get_unique_tag();
 
 #ifdef LIBMESH_ENABLE_NODE_CONSTRAINTS
   std::map<processor_id_type, std::set<dof_id_type>> pushed_node_ids;
@@ -3642,7 +3642,6 @@ void DofMap::scatter_constraints(MeshBase & mesh)
       pushed_keys_vals_to_me[pid] = data;
     };
 
-  // Trade pushed dof constraint rows
   Parallel::push_parallel_vector_data
     (this->comm(), pushed_ids_rhss, ids_rhss_action_functor);
   Parallel::push_parallel_vector_data
diff --git a/src/base/sparsity_pattern.C b/src/base/sparsity_pattern.C
index 93636c95af4..8c6455992e5 100644
--- a/src/base/sparsity_pattern.C
+++ b/src/base/sparsity_pattern.C
@@ -449,8 +449,8 @@ void Build::parallel_sync ()
   auto pid = comm.rank();
   auto num_procs = comm.size();
 
-  auto dof_tag = comm.get_unique_tag(998);
-  auto row_tag = comm.get_unique_tag(9998);
+  auto dof_tag = comm.get_unique_tag();
+  auto row_tag = comm.get_unique_tag();
 
   const auto n_global_dofs   = dof_map.n_dofs();
   const auto n_dofs_on_proc  = dof_map.n_dofs_on_processor(pid);
diff --git a/src/mesh/boundary_info.C b/src/mesh/boundary_info.C
index e4cefbccc5b..e48d02dd1ba 100644
--- a/src/mesh/boundary_info.C
+++ b/src/mesh/boundary_info.C
@@ -1836,9 +1836,9 @@ BoundaryInfo::build_node_list_from_side_list()
   // Otherwise we need to push ghost node bcids to their owners, then
   // pull ghost node bcids from their owners.
   Parallel::MessageTag
-    node_pushes_tag = this->comm().get_unique_tag(31337),
-    node_pulls_tag = this->comm().get_unique_tag(31338),
-    node_responses_tag = this->comm().get_unique_tag(31339);
+    node_pushes_tag = this->comm().get_unique_tag(),
+    node_pulls_tag = this->comm().get_unique_tag(),
+    node_responses_tag = this->comm().get_unique_tag();
 
   std::vector<Parallel::Request> node_push_requests(n_proc-1);
 
diff --git a/src/mesh/mesh_communication.C b/src/mesh/mesh_communication.C
index aa280f35d77..22d2349ba20 100644
--- a/src/mesh/mesh_communication.C
+++ b/src/mesh/mesh_communication.C
@@ -588,10 +588,9 @@ void MeshCommunication::gather_neighboring_elements (DistributedMesh & mesh) con
   mesh.find_neighbors (/* reset_remote_elements = */ true,
                        /* reset_current_list    = */ true);
 
-  // Get a unique message tag to use in communications; we'll default
-  // to some numbers around pi*10000
+  // Get a unique message tag to use in communications
   Parallel::MessageTag
-    element_neighbors_tag = mesh.comm().get_unique_tag(31416);
+    element_neighbors_tag = mesh.comm().get_unique_tag();
 
   // Now any element with a nullptr neighbor either
   // (i) lives on the physical domain boundary, or
diff --git a/src/mesh/mesh_refinement.C b/src/mesh/mesh_refinement.C
index 69b4448ed44..a5c86d1d46b 100644
--- a/src/mesh/mesh_refinement.C
+++ b/src/mesh/mesh_refinement.C
@@ -1100,7 +1100,7 @@ bool MeshRefinement::make_coarsening_compatible()
       parallel_object_only();
 
       Parallel::MessageTag
-        uncoarsenable_tag = this->comm().get_unique_tag(2718);
+        uncoarsenable_tag = this->comm().get_unique_tag();
       std::vector<Parallel::Request> uncoarsenable_push_requests(n_proc-1);
 
       for (processor_id_type p = 0; p != n_proc; ++p)
@@ -1733,7 +1733,7 @@ void MeshRefinement::uniformly_coarsen (unsigned int n)
               parents_to_coarsen[elem->processor_id()].push_back(elem->id());
 
           Parallel::MessageTag
-            coarsen_tag = this->comm().get_unique_tag(271);
+            coarsen_tag = this->comm().get_unique_tag();
           std::vector<Parallel::Request> coarsen_push_requests(n_proc-1);
 
           for (processor_id_type p = 0; p != n_proc; ++p)
diff --git a/src/mesh/mesh_tools.C b/src/mesh/mesh_tools.C
index d961fc43539..f11995008d0 100644
--- a/src/mesh/mesh_tools.C
+++ b/src/mesh/mesh_tools.C
@@ -2365,11 +2365,8 @@ void MeshTools::correct_node_proc_ids (MeshBase & mesh)
         }
     };
 
-  // Push using non-blocking I/O
-  std::vector<Parallel::Request> push_requests;
-
   Parallel::push_parallel_vector_data
-    (mesh.comm(), ids_to_push, push_requests, action_functor);
+    (mesh.comm(), ids_to_push, action_functor);
 
   // Now new_proc_ids is correct for every node we used to own.  Let's
   // ask every other processor about the nodes they used to own.  But
@@ -2383,9 +2380,6 @@ void MeshTools::correct_node_proc_ids (MeshBase & mesh)
         ex_local_nodes.insert(node);
     }
 
-  // Let's finish with previous I/O before we start more.
-  Parallel::wait(push_requests);
-
   SyncProcIdsFromMap sync(new_proc_ids, mesh);
   if (repartition_all_nodes)
     Parallel::sync_dofobject_data_by_id
diff --git a/src/mesh/nemesis_io.C b/src/mesh/nemesis_io.C
index bdb21a322e9..3ec967114ec 100644
--- a/src/mesh/nemesis_io.C
+++ b/src/mesh/nemesis_io.C
@@ -376,7 +376,7 @@ void Nemesis_IO::read (const std::string & base_filename)
   // we do not own
 
   // Let's get a unique message tag to use for send()/receive()
-  Parallel::MessageTag nodes_tag = mesh.comm().get_unique_tag(12345);
+  Parallel::MessageTag nodes_tag = mesh.comm().get_unique_tag();
 
   std::vector<std::vector<int>>
     needed_node_idxs (nemhelper->num_node_cmaps); // the indices we will ask for
diff --git a/src/mesh/xdr_io.C b/src/mesh/xdr_io.C
index a81adcfbb3e..7598ae338f3 100644
--- a/src/mesh/xdr_io.C
+++ b/src/mesh/xdr_io.C
@@ -770,8 +770,8 @@ void XdrIO::write_serialized_nodes (Xdr & io, const dof_id_type max_node_id) con
         coord_request_handles(this->n_processors()-1);
 
       Parallel::MessageTag
-        id_tag    = mesh.comm().get_unique_tag(1234),
-        coord_tag = mesh.comm().get_unique_tag(1235);
+        id_tag    = mesh.comm().get_unique_tag(),
+        coord_tag = mesh.comm().get_unique_tag();
 
       // Post the receives -- do this on processor 0 only.
       if (this->processor_id() == 0)
@@ -909,8 +909,8 @@ void XdrIO::write_serialized_nodes (Xdr & io, const dof_id_type max_node_id) con
         id_request_handles(this->n_processors()-1);
 
       Parallel::MessageTag
-        unique_id_tag = mesh.comm().get_unique_tag(1236),
-        id_tag    = mesh.comm().get_unique_tag(1237);
+        unique_id_tag = mesh.comm().get_unique_tag(),
+        id_tag    = mesh.comm().get_unique_tag();
 
       // Post the receives -- do this on processor 0 only.
       if (this->processor_id() == 0)
diff --git a/src/parallel/communicator.C b/src/parallel/communicator.C
index 304d888b191..6e6de126556 100644
--- a/src/parallel/communicator.C
+++ b/src/parallel/communicator.C
@@ -21,6 +21,7 @@
 
 // libMesh includes
 #include "libmesh/libmesh_logging.h"
+#include "libmesh/parallel_implementation.h" // for inline max(int)
 
 namespace libMesh
 {
@@ -62,6 +63,8 @@ Communicator::Communicator () :
   _size(1),
   _send_mode(DEFAULT),
   used_tag_values(),
+  _next_tag(0),
+  _max_tag(std::numeric_limits<int>::max()),
   _I_duped_it(false) {}
 
 
@@ -73,6 +76,8 @@ Communicator::Communicator (const communicator & comm) :
   _size(1),
   _send_mode(DEFAULT),
   used_tag_values(),
+  _next_tag(0),
+  _max_tag(std::numeric_limits<int>::max()),
   _I_duped_it(false)
 {
   this->assign(comm);
@@ -170,12 +175,20 @@ void Communicator::assign(const communicator & comm)
 
       libmesh_assert_greater_equal (i, 0);
       _rank = cast_int<processor_id_type>(i);
+
+      int * maxTag;
+      int flag = false;
+      libmesh_call_mpi(MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &maxTag, &flag));
+      libmesh_assert(flag);
+      _max_tag = *maxTag;
     }
   else
     {
       _rank = 0;
       _size = 1;
+      _max_tag = std::numeric_limits<int>::max();
     }
+  _next_tag = _max_tag / 2;
 #endif
   _send_mode = DEFAULT;
 }
@@ -197,9 +210,33 @@ void Communicator::barrier () const
 void Communicator::barrier () const {}
 #endif
 
+#ifdef LIBMESH_HAVE_MPI
+void Communicator::nonblocking_barrier (Request & req) const
+{
+  if (this->size() > 1)
+    {
+      LOG_SCOPE("nonblocking_barrier()", "Parallel");
+      libmesh_call_mpi(MPI_Ibarrier (this->get(), req.get()));
+    }
+}
+#else
+void Communicator::nonblocking_barrier (Request & /*req*/) const {}
+#endif
+
 
 MessageTag Communicator::get_unique_tag(int tagvalue) const
 {
+  if (tagvalue == MessageTag::invalid_tag)
+    {
+#ifndef NDEBUG
+      // Automatic tag values have to be requested in sync
+      int maxval = _next_tag;
+      this->max(maxval);
+      libmesh_assert_equal_to(_next_tag, maxval);
+#endif
+      tagvalue = _next_tag++;
+    }
+
   if (used_tag_values.count(tagvalue))
     {
       // Get the largest value in the used values, and pick one
@@ -207,15 +244,13 @@ MessageTag Communicator::get_unique_tag(int tagvalue) const
       tagvalue = used_tag_values.rbegin()->first+1;
       libmesh_assert(!used_tag_values.count(tagvalue));
     }
-  used_tag_values[tagvalue] = 1;
+  if (tagvalue >= _next_tag)
+    _next_tag = tagvalue+1;
 
-  // #ifndef NDEBUG
-  //   // Make sure everyone called get_unique_tag and make sure
-  //   // everyone got the same value
-  //   int maxval = tagvalue;
-  //   this->max(maxval);
-  //   libmesh_assert_equal_to (tagvalue, maxval);
-  // #endif
+  if (_next_tag >= _max_tag)
+    _next_tag = _max_tag/2;
+
+  used_tag_values[tagvalue] = 1;
 
   return MessageTag(tagvalue, this);
 }
diff --git a/src/systems/system_io.C b/src/systems/system_io.C
index 12bb3a94d92..a0ff2c21780 100644
--- a/src/systems/system_io.C
+++ b/src/systems/system_io.C
@@ -1133,7 +1133,7 @@ unsigned int System::read_SCALAR_dofs (const unsigned int var,
 #ifdef LIBMESH_HAVE_MPI
   if (this->n_processors() > 1)
     {
-      const Parallel::MessageTag val_tag = this->comm().get_unique_tag(321);
+      const Parallel::MessageTag val_tag = this->comm().get_unique_tag();
 
       // Post the receive on the last processor
       if (this->processor_id() == (this->n_processors()-1))
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0e1afa8115a..6742347bbc9 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -71,8 +71,10 @@ unit_tests_sources = \
   numerics/vector_value_test.C \
   numerics/type_tensor_test.C \
   numerics/dense_matrix_test.C \
+  parallel/message_tag.C \
   parallel/packed_range_test.C \
   parallel/parallel_sort_test.C \
+  parallel/parallel_sync_test.C \
   parallel/parallel_test.C \
   parallel/parallel_point_test.C \
   partitioning/partitioner_test.h \
diff --git a/tests/Makefile.in b/tests/Makefile.in
index 3a86aacf9f6..c70722522ea 100644
--- a/tests/Makefile.in
+++ b/tests/Makefile.in
@@ -210,7 +210,8 @@ am__unit_tests_dbg_SOURCES_DIST = driver.C test_comm.h \
 	numerics/trilinos_epetra_vector_test.C \
 	numerics/type_vector_test.h numerics/vector_value_test.C \
 	numerics/type_tensor_test.C numerics/dense_matrix_test.C \
-	parallel/packed_range_test.C parallel/parallel_sort_test.C \
+	parallel/message_tag.C parallel/packed_range_test.C \
+	parallel/parallel_sort_test.C parallel/parallel_sync_test.C \
 	parallel/parallel_test.C parallel/parallel_point_test.C \
 	partitioning/partitioner_test.h \
 	partitioning/centroid_partitioner_test.C \
@@ -276,8 +277,10 @@ am__objects_2 = unit_tests_dbg-driver.$(OBJEXT) \
 	numerics/unit_tests_dbg-vector_value_test.$(OBJEXT) \
 	numerics/unit_tests_dbg-type_tensor_test.$(OBJEXT) \
 	numerics/unit_tests_dbg-dense_matrix_test.$(OBJEXT) \
+	parallel/unit_tests_dbg-message_tag.$(OBJEXT) \
 	parallel/unit_tests_dbg-packed_range_test.$(OBJEXT) \
 	parallel/unit_tests_dbg-parallel_sort_test.$(OBJEXT) \
+	parallel/unit_tests_dbg-parallel_sync_test.$(OBJEXT) \
 	parallel/unit_tests_dbg-parallel_test.$(OBJEXT) \
 	parallel/unit_tests_dbg-parallel_point_test.$(OBJEXT) \
 	partitioning/unit_tests_dbg-centroid_partitioner_test.$(OBJEXT) \
@@ -333,7 +336,8 @@ am__unit_tests_devel_SOURCES_DIST = driver.C test_comm.h \
 	numerics/trilinos_epetra_vector_test.C \
 	numerics/type_vector_test.h numerics/vector_value_test.C \
 	numerics/type_tensor_test.C numerics/dense_matrix_test.C \
-	parallel/packed_range_test.C parallel/parallel_sort_test.C \
+	parallel/message_tag.C parallel/packed_range_test.C \
+	parallel/parallel_sort_test.C parallel/parallel_sync_test.C \
 	parallel/parallel_test.C parallel/parallel_point_test.C \
 	partitioning/partitioner_test.h \
 	partitioning/centroid_partitioner_test.C \
@@ -398,8 +402,10 @@ am__objects_4 = unit_tests_devel-driver.$(OBJEXT) \
 	numerics/unit_tests_devel-vector_value_test.$(OBJEXT) \
 	numerics/unit_tests_devel-type_tensor_test.$(OBJEXT) \
 	numerics/unit_tests_devel-dense_matrix_test.$(OBJEXT) \
+	parallel/unit_tests_devel-message_tag.$(OBJEXT) \
 	parallel/unit_tests_devel-packed_range_test.$(OBJEXT) \
 	parallel/unit_tests_devel-parallel_sort_test.$(OBJEXT) \
+	parallel/unit_tests_devel-parallel_sync_test.$(OBJEXT) \
 	parallel/unit_tests_devel-parallel_test.$(OBJEXT) \
 	parallel/unit_tests_devel-parallel_point_test.$(OBJEXT) \
 	partitioning/unit_tests_devel-centroid_partitioner_test.$(OBJEXT) \
@@ -452,7 +458,8 @@ am__unit_tests_oprof_SOURCES_DIST = driver.C test_comm.h \
 	numerics/trilinos_epetra_vector_test.C \
 	numerics/type_vector_test.h numerics/vector_value_test.C \
 	numerics/type_tensor_test.C numerics/dense_matrix_test.C \
-	parallel/packed_range_test.C parallel/parallel_sort_test.C \
+	parallel/message_tag.C parallel/packed_range_test.C \
+	parallel/parallel_sort_test.C parallel/parallel_sync_test.C \
 	parallel/parallel_test.C parallel/parallel_point_test.C \
 	partitioning/partitioner_test.h \
 	partitioning/centroid_partitioner_test.C \
@@ -517,8 +524,10 @@ am__objects_6 = unit_tests_oprof-driver.$(OBJEXT) \
 	numerics/unit_tests_oprof-vector_value_test.$(OBJEXT) \
 	numerics/unit_tests_oprof-type_tensor_test.$(OBJEXT) \
 	numerics/unit_tests_oprof-dense_matrix_test.$(OBJEXT) \
+	parallel/unit_tests_oprof-message_tag.$(OBJEXT) \
 	parallel/unit_tests_oprof-packed_range_test.$(OBJEXT) \
 	parallel/unit_tests_oprof-parallel_sort_test.$(OBJEXT) \
+	parallel/unit_tests_oprof-parallel_sync_test.$(OBJEXT) \
 	parallel/unit_tests_oprof-parallel_test.$(OBJEXT) \
 	parallel/unit_tests_oprof-parallel_point_test.$(OBJEXT) \
 	partitioning/unit_tests_oprof-centroid_partitioner_test.$(OBJEXT) \
@@ -571,7 +580,8 @@ am__unit_tests_opt_SOURCES_DIST = driver.C test_comm.h \
 	numerics/trilinos_epetra_vector_test.C \
 	numerics/type_vector_test.h numerics/vector_value_test.C \
 	numerics/type_tensor_test.C numerics/dense_matrix_test.C \
-	parallel/packed_range_test.C parallel/parallel_sort_test.C \
+	parallel/message_tag.C parallel/packed_range_test.C \
+	parallel/parallel_sort_test.C parallel/parallel_sync_test.C \
 	parallel/parallel_test.C parallel/parallel_point_test.C \
 	partitioning/partitioner_test.h \
 	partitioning/centroid_partitioner_test.C \
@@ -636,8 +646,10 @@ am__objects_8 = unit_tests_opt-driver.$(OBJEXT) \
 	numerics/unit_tests_opt-vector_value_test.$(OBJEXT) \
 	numerics/unit_tests_opt-type_tensor_test.$(OBJEXT) \
 	numerics/unit_tests_opt-dense_matrix_test.$(OBJEXT) \
+	parallel/unit_tests_opt-message_tag.$(OBJEXT) \
 	parallel/unit_tests_opt-packed_range_test.$(OBJEXT) \
 	parallel/unit_tests_opt-parallel_sort_test.$(OBJEXT) \
+	parallel/unit_tests_opt-parallel_sync_test.$(OBJEXT) \
 	parallel/unit_tests_opt-parallel_test.$(OBJEXT) \
 	parallel/unit_tests_opt-parallel_point_test.$(OBJEXT) \
 	partitioning/unit_tests_opt-centroid_partitioner_test.$(OBJEXT) \
@@ -689,7 +701,8 @@ am__unit_tests_prof_SOURCES_DIST = driver.C test_comm.h \
 	numerics/trilinos_epetra_vector_test.C \
 	numerics/type_vector_test.h numerics/vector_value_test.C \
 	numerics/type_tensor_test.C numerics/dense_matrix_test.C \
-	parallel/packed_range_test.C parallel/parallel_sort_test.C \
+	parallel/message_tag.C parallel/packed_range_test.C \
+	parallel/parallel_sort_test.C parallel/parallel_sync_test.C \
 	parallel/parallel_test.C parallel/parallel_point_test.C \
 	partitioning/partitioner_test.h \
 	partitioning/centroid_partitioner_test.C \
@@ -754,8 +767,10 @@ am__objects_10 = unit_tests_prof-driver.$(OBJEXT) \
 	numerics/unit_tests_prof-vector_value_test.$(OBJEXT) \
 	numerics/unit_tests_prof-type_tensor_test.$(OBJEXT) \
 	numerics/unit_tests_prof-dense_matrix_test.$(OBJEXT) \
+	parallel/unit_tests_prof-message_tag.$(OBJEXT) \
 	parallel/unit_tests_prof-packed_range_test.$(OBJEXT) \
 	parallel/unit_tests_prof-parallel_sort_test.$(OBJEXT) \
+	parallel/unit_tests_prof-parallel_sync_test.$(OBJEXT) \
 	parallel/unit_tests_prof-parallel_test.$(OBJEXT) \
 	parallel/unit_tests_prof-parallel_point_test.$(OBJEXT) \
 	partitioning/unit_tests_prof-centroid_partitioner_test.$(OBJEXT) \
@@ -1040,25 +1055,35 @@ am__depfiles_remade = ./$(DEPDIR)/unit_tests_dbg-driver.Po \
 	numerics/$(DEPDIR)/unit_tests_prof-trilinos_epetra_vector_test.Po \
 	numerics/$(DEPDIR)/unit_tests_prof-type_tensor_test.Po \
 	numerics/$(DEPDIR)/unit_tests_prof-vector_value_test.Po \
+	parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Po \
 	parallel/$(DEPDIR)/unit_tests_dbg-packed_range_test.Po \
 	parallel/$(DEPDIR)/unit_tests_dbg-parallel_point_test.Po \
 	parallel/$(DEPDIR)/unit_tests_dbg-parallel_sort_test.Po \
+	parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Po \
 	parallel/$(DEPDIR)/unit_tests_dbg-parallel_test.Po \
+	parallel/$(DEPDIR)/unit_tests_devel-message_tag.Po \
 	parallel/$(DEPDIR)/unit_tests_devel-packed_range_test.Po \
 	parallel/$(DEPDIR)/unit_tests_devel-parallel_point_test.Po \
 	parallel/$(DEPDIR)/unit_tests_devel-parallel_sort_test.Po \
+	parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Po \
 	parallel/$(DEPDIR)/unit_tests_devel-parallel_test.Po \
+	parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Po \
 	parallel/$(DEPDIR)/unit_tests_oprof-packed_range_test.Po \
 	parallel/$(DEPDIR)/unit_tests_oprof-parallel_point_test.Po \
 	parallel/$(DEPDIR)/unit_tests_oprof-parallel_sort_test.Po \
+	parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Po \
 	parallel/$(DEPDIR)/unit_tests_oprof-parallel_test.Po \
+	parallel/$(DEPDIR)/unit_tests_opt-message_tag.Po \
 	parallel/$(DEPDIR)/unit_tests_opt-packed_range_test.Po \
 	parallel/$(DEPDIR)/unit_tests_opt-parallel_point_test.Po \
 	parallel/$(DEPDIR)/unit_tests_opt-parallel_sort_test.Po \
+	parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Po \
 	parallel/$(DEPDIR)/unit_tests_opt-parallel_test.Po \
+	parallel/$(DEPDIR)/unit_tests_prof-message_tag.Po \
 	parallel/$(DEPDIR)/unit_tests_prof-packed_range_test.Po \
 	parallel/$(DEPDIR)/unit_tests_prof-parallel_point_test.Po \
 	parallel/$(DEPDIR)/unit_tests_prof-parallel_sort_test.Po \
+	parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Po \
 	parallel/$(DEPDIR)/unit_tests_prof-parallel_test.Po \
 	partitioning/$(DEPDIR)/unit_tests_dbg-centroid_partitioner_test.Po \
 	partitioning/$(DEPDIR)/unit_tests_dbg-hilbert_sfc_partitioner_test.Po \
@@ -1569,7 +1594,8 @@ unit_tests_sources = driver.C test_comm.h stream_redirector.h \
 	numerics/trilinos_epetra_vector_test.C \
 	numerics/type_vector_test.h numerics/vector_value_test.C \
 	numerics/type_tensor_test.C numerics/dense_matrix_test.C \
-	parallel/packed_range_test.C parallel/parallel_sort_test.C \
+	parallel/message_tag.C parallel/packed_range_test.C \
+	parallel/parallel_sort_test.C parallel/parallel_sync_test.C \
 	parallel/parallel_test.C parallel/parallel_point_test.C \
 	partitioning/partitioner_test.h \
 	partitioning/centroid_partitioner_test.C \
@@ -1791,10 +1817,14 @@ parallel/$(am__dirstamp):
 parallel/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) parallel/$(DEPDIR)
 	@: > parallel/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_dbg-message_tag.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_dbg-packed_range_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_dbg-parallel_sort_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_dbg-parallel_sync_test.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_dbg-parallel_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_dbg-parallel_point_test.$(OBJEXT):  \
@@ -1971,10 +2001,14 @@ numerics/unit_tests_devel-type_tensor_test.$(OBJEXT):  \
 	numerics/$(am__dirstamp) numerics/$(DEPDIR)/$(am__dirstamp)
 numerics/unit_tests_devel-dense_matrix_test.$(OBJEXT):  \
 	numerics/$(am__dirstamp) numerics/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_devel-message_tag.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_devel-packed_range_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_devel-parallel_sort_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_devel-parallel_sync_test.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_devel-parallel_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_devel-parallel_point_test.$(OBJEXT):  \
@@ -2115,10 +2149,14 @@ numerics/unit_tests_oprof-type_tensor_test.$(OBJEXT):  \
 	numerics/$(am__dirstamp) numerics/$(DEPDIR)/$(am__dirstamp)
 numerics/unit_tests_oprof-dense_matrix_test.$(OBJEXT):  \
 	numerics/$(am__dirstamp) numerics/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_oprof-message_tag.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_oprof-packed_range_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_oprof-parallel_sort_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_oprof-parallel_sync_test.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_oprof-parallel_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_oprof-parallel_point_test.$(OBJEXT):  \
@@ -2259,10 +2297,14 @@ numerics/unit_tests_opt-type_tensor_test.$(OBJEXT):  \
 	numerics/$(am__dirstamp) numerics/$(DEPDIR)/$(am__dirstamp)
 numerics/unit_tests_opt-dense_matrix_test.$(OBJEXT):  \
 	numerics/$(am__dirstamp) numerics/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_opt-message_tag.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_opt-packed_range_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_opt-parallel_sort_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_opt-parallel_sync_test.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_opt-parallel_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_opt-parallel_point_test.$(OBJEXT):  \
@@ -2403,10 +2445,14 @@ numerics/unit_tests_prof-type_tensor_test.$(OBJEXT):  \
 	numerics/$(am__dirstamp) numerics/$(DEPDIR)/$(am__dirstamp)
 numerics/unit_tests_prof-dense_matrix_test.$(OBJEXT):  \
 	numerics/$(am__dirstamp) numerics/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_prof-message_tag.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_prof-packed_range_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_prof-parallel_sort_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
+parallel/unit_tests_prof-parallel_sync_test.$(OBJEXT):  \
+	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_prof-parallel_test.$(OBJEXT):  \
 	parallel/$(am__dirstamp) parallel/$(DEPDIR)/$(am__dirstamp)
 parallel/unit_tests_prof-parallel_point_test.$(OBJEXT):  \
@@ -2717,25 +2763,35 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@numerics/$(DEPDIR)/unit_tests_prof-trilinos_epetra_vector_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@numerics/$(DEPDIR)/unit_tests_prof-type_tensor_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@numerics/$(DEPDIR)/unit_tests_prof-vector_value_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_dbg-packed_range_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_dbg-parallel_point_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_dbg-parallel_sort_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_dbg-parallel_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_devel-message_tag.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_devel-packed_range_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_devel-parallel_point_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_devel-parallel_sort_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_devel-parallel_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_oprof-packed_range_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_oprof-parallel_point_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_oprof-parallel_sort_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_oprof-parallel_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_opt-message_tag.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_opt-packed_range_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_opt-parallel_point_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_opt-parallel_sort_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_opt-parallel_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_prof-message_tag.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_prof-packed_range_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_prof-parallel_point_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_prof-parallel_sort_test.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@parallel/$(DEPDIR)/unit_tests_prof-parallel_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@partitioning/$(DEPDIR)/unit_tests_dbg-centroid_partitioner_test.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@partitioning/$(DEPDIR)/unit_tests_dbg-hilbert_sfc_partitioner_test.Po@am__quote@ # am--include-marker
@@ -3510,6 +3566,20 @@ numerics/unit_tests_dbg-dense_matrix_test.obj: numerics/dense_matrix_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -c -o numerics/unit_tests_dbg-dense_matrix_test.obj `if test -f 'numerics/dense_matrix_test.C'; then $(CYGPATH_W) 'numerics/dense_matrix_test.C'; else $(CYGPATH_W) '$(srcdir)/numerics/dense_matrix_test.C'; fi`
 
+parallel/unit_tests_dbg-message_tag.o: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_dbg-message_tag.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Tpo -c -o parallel/unit_tests_dbg-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_dbg-message_tag.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_dbg-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+
+parallel/unit_tests_dbg-message_tag.obj: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_dbg-message_tag.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Tpo -c -o parallel/unit_tests_dbg-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_dbg-message_tag.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_dbg-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+
 parallel/unit_tests_dbg-packed_range_test.o: parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_dbg-packed_range_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_dbg-packed_range_test.Tpo -c -o parallel/unit_tests_dbg-packed_range_test.o `test -f 'parallel/packed_range_test.C' || echo '$(srcdir)/'`parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_dbg-packed_range_test.Tpo parallel/$(DEPDIR)/unit_tests_dbg-packed_range_test.Po
@@ -3538,6 +3608,20 @@ parallel/unit_tests_dbg-parallel_sort_test.obj: parallel/parallel_sort_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_dbg-parallel_sort_test.obj `if test -f 'parallel/parallel_sort_test.C'; then $(CYGPATH_W) 'parallel/parallel_sort_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sort_test.C'; fi`
 
+parallel/unit_tests_dbg-parallel_sync_test.o: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_dbg-parallel_sync_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Tpo -c -o parallel/unit_tests_dbg-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_dbg-parallel_sync_test.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_dbg-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+
+parallel/unit_tests_dbg-parallel_sync_test.obj: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_dbg-parallel_sync_test.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Tpo -c -o parallel/unit_tests_dbg-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_dbg-parallel_sync_test.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_dbg-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+
 parallel/unit_tests_dbg-parallel_test.o: parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_dbg_CPPFLAGS) $(CPPFLAGS) $(unit_tests_dbg_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_dbg-parallel_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_dbg-parallel_test.Tpo -c -o parallel/unit_tests_dbg-parallel_test.o `test -f 'parallel/parallel_test.C' || echo '$(srcdir)/'`parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_dbg-parallel_test.Tpo parallel/$(DEPDIR)/unit_tests_dbg-parallel_test.Po
@@ -4448,6 +4532,20 @@ numerics/unit_tests_devel-dense_matrix_test.obj: numerics/dense_matrix_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -c -o numerics/unit_tests_devel-dense_matrix_test.obj `if test -f 'numerics/dense_matrix_test.C'; then $(CYGPATH_W) 'numerics/dense_matrix_test.C'; else $(CYGPATH_W) '$(srcdir)/numerics/dense_matrix_test.C'; fi`
 
+parallel/unit_tests_devel-message_tag.o: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_devel-message_tag.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_devel-message_tag.Tpo -c -o parallel/unit_tests_devel-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_devel-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_devel-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_devel-message_tag.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_devel-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+
+parallel/unit_tests_devel-message_tag.obj: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_devel-message_tag.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_devel-message_tag.Tpo -c -o parallel/unit_tests_devel-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_devel-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_devel-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_devel-message_tag.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_devel-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+
 parallel/unit_tests_devel-packed_range_test.o: parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_devel-packed_range_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_devel-packed_range_test.Tpo -c -o parallel/unit_tests_devel-packed_range_test.o `test -f 'parallel/packed_range_test.C' || echo '$(srcdir)/'`parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_devel-packed_range_test.Tpo parallel/$(DEPDIR)/unit_tests_devel-packed_range_test.Po
@@ -4476,6 +4574,20 @@ parallel/unit_tests_devel-parallel_sort_test.obj: parallel/parallel_sort_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_devel-parallel_sort_test.obj `if test -f 'parallel/parallel_sort_test.C'; then $(CYGPATH_W) 'parallel/parallel_sort_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sort_test.C'; fi`
 
+parallel/unit_tests_devel-parallel_sync_test.o: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_devel-parallel_sync_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Tpo -c -o parallel/unit_tests_devel-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_devel-parallel_sync_test.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_devel-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+
+parallel/unit_tests_devel-parallel_sync_test.obj: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_devel-parallel_sync_test.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Tpo -c -o parallel/unit_tests_devel-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_devel-parallel_sync_test.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_devel-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+
 parallel/unit_tests_devel-parallel_test.o: parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_devel_CPPFLAGS) $(CPPFLAGS) $(unit_tests_devel_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_devel-parallel_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_devel-parallel_test.Tpo -c -o parallel/unit_tests_devel-parallel_test.o `test -f 'parallel/parallel_test.C' || echo '$(srcdir)/'`parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_devel-parallel_test.Tpo parallel/$(DEPDIR)/unit_tests_devel-parallel_test.Po
@@ -5386,6 +5498,20 @@ numerics/unit_tests_oprof-dense_matrix_test.obj: numerics/dense_matrix_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -c -o numerics/unit_tests_oprof-dense_matrix_test.obj `if test -f 'numerics/dense_matrix_test.C'; then $(CYGPATH_W) 'numerics/dense_matrix_test.C'; else $(CYGPATH_W) '$(srcdir)/numerics/dense_matrix_test.C'; fi`
 
+parallel/unit_tests_oprof-message_tag.o: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_oprof-message_tag.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Tpo -c -o parallel/unit_tests_oprof-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_oprof-message_tag.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_oprof-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+
+parallel/unit_tests_oprof-message_tag.obj: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_oprof-message_tag.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Tpo -c -o parallel/unit_tests_oprof-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_oprof-message_tag.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_oprof-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+
 parallel/unit_tests_oprof-packed_range_test.o: parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_oprof-packed_range_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_oprof-packed_range_test.Tpo -c -o parallel/unit_tests_oprof-packed_range_test.o `test -f 'parallel/packed_range_test.C' || echo '$(srcdir)/'`parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_oprof-packed_range_test.Tpo parallel/$(DEPDIR)/unit_tests_oprof-packed_range_test.Po
@@ -5414,6 +5540,20 @@ parallel/unit_tests_oprof-parallel_sort_test.obj: parallel/parallel_sort_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_oprof-parallel_sort_test.obj `if test -f 'parallel/parallel_sort_test.C'; then $(CYGPATH_W) 'parallel/parallel_sort_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sort_test.C'; fi`
 
+parallel/unit_tests_oprof-parallel_sync_test.o: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_oprof-parallel_sync_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Tpo -c -o parallel/unit_tests_oprof-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_oprof-parallel_sync_test.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_oprof-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+
+parallel/unit_tests_oprof-parallel_sync_test.obj: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_oprof-parallel_sync_test.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Tpo -c -o parallel/unit_tests_oprof-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_oprof-parallel_sync_test.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_oprof-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+
 parallel/unit_tests_oprof-parallel_test.o: parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_oprof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_oprof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_oprof-parallel_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_oprof-parallel_test.Tpo -c -o parallel/unit_tests_oprof-parallel_test.o `test -f 'parallel/parallel_test.C' || echo '$(srcdir)/'`parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_oprof-parallel_test.Tpo parallel/$(DEPDIR)/unit_tests_oprof-parallel_test.Po
@@ -6324,6 +6464,20 @@ numerics/unit_tests_opt-dense_matrix_test.obj: numerics/dense_matrix_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -c -o numerics/unit_tests_opt-dense_matrix_test.obj `if test -f 'numerics/dense_matrix_test.C'; then $(CYGPATH_W) 'numerics/dense_matrix_test.C'; else $(CYGPATH_W) '$(srcdir)/numerics/dense_matrix_test.C'; fi`
 
+parallel/unit_tests_opt-message_tag.o: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_opt-message_tag.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_opt-message_tag.Tpo -c -o parallel/unit_tests_opt-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_opt-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_opt-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_opt-message_tag.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_opt-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+
+parallel/unit_tests_opt-message_tag.obj: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_opt-message_tag.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_opt-message_tag.Tpo -c -o parallel/unit_tests_opt-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_opt-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_opt-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_opt-message_tag.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_opt-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+
 parallel/unit_tests_opt-packed_range_test.o: parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_opt-packed_range_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_opt-packed_range_test.Tpo -c -o parallel/unit_tests_opt-packed_range_test.o `test -f 'parallel/packed_range_test.C' || echo '$(srcdir)/'`parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_opt-packed_range_test.Tpo parallel/$(DEPDIR)/unit_tests_opt-packed_range_test.Po
@@ -6352,6 +6506,20 @@ parallel/unit_tests_opt-parallel_sort_test.obj: parallel/parallel_sort_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_opt-parallel_sort_test.obj `if test -f 'parallel/parallel_sort_test.C'; then $(CYGPATH_W) 'parallel/parallel_sort_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sort_test.C'; fi`
 
+parallel/unit_tests_opt-parallel_sync_test.o: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_opt-parallel_sync_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Tpo -c -o parallel/unit_tests_opt-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_opt-parallel_sync_test.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_opt-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+
+parallel/unit_tests_opt-parallel_sync_test.obj: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_opt-parallel_sync_test.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Tpo -c -o parallel/unit_tests_opt-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_opt-parallel_sync_test.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_opt-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+
 parallel/unit_tests_opt-parallel_test.o: parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_opt_CPPFLAGS) $(CPPFLAGS) $(unit_tests_opt_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_opt-parallel_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_opt-parallel_test.Tpo -c -o parallel/unit_tests_opt-parallel_test.o `test -f 'parallel/parallel_test.C' || echo '$(srcdir)/'`parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_opt-parallel_test.Tpo parallel/$(DEPDIR)/unit_tests_opt-parallel_test.Po
@@ -7262,6 +7430,20 @@ numerics/unit_tests_prof-dense_matrix_test.obj: numerics/dense_matrix_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -c -o numerics/unit_tests_prof-dense_matrix_test.obj `if test -f 'numerics/dense_matrix_test.C'; then $(CYGPATH_W) 'numerics/dense_matrix_test.C'; else $(CYGPATH_W) '$(srcdir)/numerics/dense_matrix_test.C'; fi`
 
+parallel/unit_tests_prof-message_tag.o: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_prof-message_tag.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_prof-message_tag.Tpo -c -o parallel/unit_tests_prof-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_prof-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_prof-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_prof-message_tag.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_prof-message_tag.o `test -f 'parallel/message_tag.C' || echo '$(srcdir)/'`parallel/message_tag.C
+
+parallel/unit_tests_prof-message_tag.obj: parallel/message_tag.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_prof-message_tag.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_prof-message_tag.Tpo -c -o parallel/unit_tests_prof-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_prof-message_tag.Tpo parallel/$(DEPDIR)/unit_tests_prof-message_tag.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/message_tag.C' object='parallel/unit_tests_prof-message_tag.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_prof-message_tag.obj `if test -f 'parallel/message_tag.C'; then $(CYGPATH_W) 'parallel/message_tag.C'; else $(CYGPATH_W) '$(srcdir)/parallel/message_tag.C'; fi`
+
 parallel/unit_tests_prof-packed_range_test.o: parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_prof-packed_range_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_prof-packed_range_test.Tpo -c -o parallel/unit_tests_prof-packed_range_test.o `test -f 'parallel/packed_range_test.C' || echo '$(srcdir)/'`parallel/packed_range_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_prof-packed_range_test.Tpo parallel/$(DEPDIR)/unit_tests_prof-packed_range_test.Po
@@ -7290,6 +7472,20 @@ parallel/unit_tests_prof-parallel_sort_test.obj: parallel/parallel_sort_test.C
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_prof-parallel_sort_test.obj `if test -f 'parallel/parallel_sort_test.C'; then $(CYGPATH_W) 'parallel/parallel_sort_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sort_test.C'; fi`
 
+parallel/unit_tests_prof-parallel_sync_test.o: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_prof-parallel_sync_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Tpo -c -o parallel/unit_tests_prof-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_prof-parallel_sync_test.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_prof-parallel_sync_test.o `test -f 'parallel/parallel_sync_test.C' || echo '$(srcdir)/'`parallel/parallel_sync_test.C
+
+parallel/unit_tests_prof-parallel_sync_test.obj: parallel/parallel_sync_test.C
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_prof-parallel_sync_test.obj -MD -MP -MF parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Tpo -c -o parallel/unit_tests_prof-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Tpo parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='parallel/parallel_sync_test.C' object='parallel/unit_tests_prof-parallel_sync_test.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -c -o parallel/unit_tests_prof-parallel_sync_test.obj `if test -f 'parallel/parallel_sync_test.C'; then $(CYGPATH_W) 'parallel/parallel_sync_test.C'; else $(CYGPATH_W) '$(srcdir)/parallel/parallel_sync_test.C'; fi`
+
 parallel/unit_tests_prof-parallel_test.o: parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(unit_tests_prof_CPPFLAGS) $(CPPFLAGS) $(unit_tests_prof_CXXFLAGS) $(CXXFLAGS) -MT parallel/unit_tests_prof-parallel_test.o -MD -MP -MF parallel/$(DEPDIR)/unit_tests_prof-parallel_test.Tpo -c -o parallel/unit_tests_prof-parallel_test.o `test -f 'parallel/parallel_test.C' || echo '$(srcdir)/'`parallel/parallel_test.C
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) parallel/$(DEPDIR)/unit_tests_prof-parallel_test.Tpo parallel/$(DEPDIR)/unit_tests_prof-parallel_test.Po
@@ -8024,25 +8220,35 @@ distclean: distclean-am
 	-rm -f numerics/$(DEPDIR)/unit_tests_prof-trilinos_epetra_vector_test.Po
 	-rm -f numerics/$(DEPDIR)/unit_tests_prof-type_tensor_test.Po
 	-rm -f numerics/$(DEPDIR)/unit_tests_prof-vector_value_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-parallel_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_devel-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_devel-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_devel-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_devel-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_devel-parallel_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-parallel_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_opt-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_opt-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_opt-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_opt-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_opt-parallel_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_prof-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_prof-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_prof-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_prof-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_prof-parallel_test.Po
 	-rm -f partitioning/$(DEPDIR)/unit_tests_dbg-centroid_partitioner_test.Po
 	-rm -f partitioning/$(DEPDIR)/unit_tests_dbg-hilbert_sfc_partitioner_test.Po
@@ -8404,25 +8610,35 @@ maintainer-clean: maintainer-clean-am
 	-rm -f numerics/$(DEPDIR)/unit_tests_prof-trilinos_epetra_vector_test.Po
 	-rm -f numerics/$(DEPDIR)/unit_tests_prof-type_tensor_test.Po
 	-rm -f numerics/$(DEPDIR)/unit_tests_prof-vector_value_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_dbg-parallel_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_devel-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_devel-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_devel-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_devel-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_devel-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_devel-parallel_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_oprof-parallel_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_opt-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_opt-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_opt-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_opt-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_opt-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_opt-parallel_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_prof-message_tag.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_prof-packed_range_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_prof-parallel_point_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_prof-parallel_sort_test.Po
+	-rm -f parallel/$(DEPDIR)/unit_tests_prof-parallel_sync_test.Po
 	-rm -f parallel/$(DEPDIR)/unit_tests_prof-parallel_test.Po
 	-rm -f partitioning/$(DEPDIR)/unit_tests_dbg-centroid_partitioner_test.Po
 	-rm -f partitioning/$(DEPDIR)/unit_tests_dbg-hilbert_sfc_partitioner_test.Po
diff --git a/tests/parallel/message_tag.C b/tests/parallel/message_tag.C
new file mode 100644
index 00000000000..b89a25a0078
--- /dev/null
+++ b/tests/parallel/message_tag.C
@@ -0,0 +1,103 @@
+// Ignore unused parameter warnings coming from cppunit headers
+#include <libmesh/ignore_warnings.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+#include <libmesh/restore_warnings.h>
+
+#include <libmesh/communicator.h>
+#include <libmesh/message_tag.h>
+
+#include "test_comm.h"
+
+// The CPPUNIT_TEST_SUITE_END macro expands to code that involves
+// std::auto_ptr, which in turn produces -Wdeprecated-declarations
+// warnings.  These can be ignored in GCC as long as we wrap the
+// offending code in appropriate pragmas.  We can't get away with a
+// single ignore_warnings.h inclusion at the beginning of this file,
+// since the libmesh headers pull in a restore_warnings.h at some
+// point.  We also don't bother restoring warnings at the end of this
+// file since it's not a header.
+#include <libmesh/ignore_warnings.h>
+
+using namespace libMesh;
+
+class MessageTagTest : public CppUnit::TestCase {
+public:
+  CPPUNIT_TEST_SUITE( MessageTagTest );
+
+  CPPUNIT_TEST( testGetUniqueTagAuto );
+  CPPUNIT_TEST( testGetUniqueTagManual );
+
+  CPPUNIT_TEST_SUITE_END();
+
+  // This fixture keeps no state of its own; each test creates the
+  // tags (and any communicator copy) that it needs.
+
+public:
+  void setUp()
+  {}
+
+  void tearDown()
+  {}
+
+  // Automatically generated tag values must never repeat, not even
+  // the values of tags that have already been destroyed.
+  void testGetUniqueTagAuto()
+  {
+    // We need to duplicate the communicator first, because the
+    // original might already have tags used by other unit tests
+
+    Parallel::Communicator newcomm;
+
+    TestCommWorld->duplicate(newcomm);
+
+    const int n_vals = 5;
+    const int n_vals_in_scope = 3;
+    std::vector<int> vals(n_vals);
+
+    {
+      std::vector<Parallel::MessageTag> tags(n_vals_in_scope);
+      for (int i=0; i != n_vals_in_scope; ++i)
+        {
+          tags[i] = newcomm.get_unique_tag();
+          vals[i] = tags[i].value();
+          for (int j=0; j != i; ++j)
+            {
+              CPPUNIT_ASSERT(vals[i] != vals[j]);
+            }
+        }
+    }
+
+    // Even after we go out of scope those values should be used up
+    for (int i=n_vals_in_scope; i != n_vals; ++i)
+      {
+        Parallel::MessageTag another_tag = newcomm.get_unique_tag();
+        vals[i] = another_tag.value();
+        for (int j=0; j != i; ++j)
+          {
+            CPPUNIT_ASSERT(vals[i] != vals[j]);
+          }
+      }
+  }
+
+  // Explicitly requested tag values must be handed back unchanged,
+  // even when a value is requested again after its tag was destroyed.
+  void testGetUniqueTagManual()
+  {
+    // Here we'll use the standard communicator, because even if it
+    // used these tags in other contexts it should have freed them for
+    // reuse later.
+
+    const int requests[] = {2, 4, 6, 8, 8, 6, 8, 123, 3141, 3142};
+
+    for (const int i : requests)
+      {
+        Parallel::MessageTag manual_tag =
+          TestCommWorld->get_unique_tag(i);
+        CPPUNIT_ASSERT_EQUAL(i, manual_tag.value());
+      }
+  }
+
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION( MessageTagTest );
diff --git a/tests/parallel/parallel_sync_test.C b/tests/parallel/parallel_sync_test.C
new file mode 100644
index 00000000000..7bc7fc1f924
--- /dev/null
+++ b/tests/parallel/parallel_sync_test.C
@@ -0,0 +1,590 @@
+// Ignore unused parameter warnings coming from cppunit headers
+#include <libmesh/ignore_warnings.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+#include <libmesh/restore_warnings.h>
+
+#include <libmesh/parallel_sync.h>
+#include <libmesh/int_range.h>
+#include <libmesh/simple_range.h>
+
+#include "test_comm.h"
+
+#include <algorithm>
+
+// The CPPUNIT_TEST_SUITE_END macro expands to code that involves
+// std::auto_ptr, which in turn produces -Wdeprecated-declarations
+// warnings.  These can be ignored in GCC as long as we wrap the
+// offending code in appropriate pragmas.  We can't get away with a
+// single ignore_warnings.h inclusion at the beginning of this file,
+// since the libmesh headers pull in a restore_warnings.h at some
+// point.  We also don't bother restoring warnings at the end of this
+// file since it's not a header.
+#include <libmesh/ignore_warnings.h>
+
+using namespace libMesh;
+
+class ParallelSyncTest : public CppUnit::TestCase {
+public:
+  CPPUNIT_TEST_SUITE( ParallelSyncTest );
+
+  // Our sync functions are most typically used with a map of
+  // processor ids that *only* includes ranks currently running.
+  CPPUNIT_TEST( testPush );
+  CPPUNIT_TEST( testPull );
+  CPPUNIT_TEST( testPushVecVec );
+  CPPUNIT_TEST( testPullVecVec );
+  CPPUNIT_TEST( testPushMultimap );
+  CPPUNIT_TEST( testPushMultimapVecVec );
+
+  // Our sync functions need to support sending to ranks that don't
+  // exist!  If we're on N processors but working on a mesh
+  // partitioned into M parts with M > N, then subpartition p belongs
+  // to processor p%N.  Let's make M > N for these tests.
+  CPPUNIT_TEST( testPushOversized );
+  CPPUNIT_TEST( testPullOversized );
+  CPPUNIT_TEST( testPushVecVecOversized );
+  CPPUNIT_TEST( testPullVecVecOversized );
+  CPPUNIT_TEST( testPushMultimapOversized );
+  CPPUNIT_TEST( testPushMultimapVecVecOversized );
+
+  CPPUNIT_TEST_SUITE_END();
+
+public:
+  void setUp()
+  {}  // empty: this hook does nothing for these tests
+
+  void tearDown()
+  {}  // empty: this hook does nothing for these tests
+
+
+  // Scalar payload to send to each destination id in [0, M).  This
+  // rank sends destination d a vector of sqrt(c)+1 copies of d, but
+  // only when c := |rank-d| is a perfect square.
+  void fill_scalar_data
+    (std::map<processor_id_type, std::vector<unsigned int>> & data,
+     int M)
+  {
+    const int my_rank = TestCommWorld->rank();
+    for (int dest=0; dest != M; ++dest)
+      {
+        const int gap = std::abs(dest-my_rank);
+        const int root = std::sqrt(gap);
+        if (root*root == gap)
+          for (int n_copies=0; n_copies != root+1; ++n_copies)
+            data[dest].push_back(dest);
+      }
+  }
+
+
+  // Multimap payload for each destination id in [0, M).  When
+  // c := |rank-d| is a perfect square, this rank queues two vectors
+  // for destination d: one holding sqrt(c)+1 copies of d, followed
+  // by one holding a single copy of d.
+  void fill_scalar_data
+    (std::multimap<processor_id_type, std::vector<unsigned int>> & data,
+     int M)
+  {
+    const int my_rank = TestCommWorld->rank();
+    for (int dest=0; dest != M; ++dest)
+      {
+        const int gap = std::abs(dest-my_rank);
+        const int root = std::sqrt(gap);
+        if (root*root != gap)
+          continue;
+
+        // Same key twice: the long vector goes in first, then the
+        // one-element vector.
+        const std::vector<unsigned int> many(root+1, dest);
+        data.emplace(dest, many);
+        const std::vector<unsigned int> single(1, dest);
+        data.emplace(dest, single);
+      }
+  }
+
+
+  // Nested-vector payload to send to each destination id in [0, M).
+  // When c := |rank-d| is a perfect square, destination d gets two
+  // subvectors: one with sqrt(c)+1 copies of d and one with a single
+  // copy of d.
+  void fill_vector_data
+    (std::map<processor_id_type, std::vector<std::vector<unsigned int>>> & data,
+     int M)
+  {
+    const int my_rank = TestCommWorld->rank();
+    for (int dest=0; dest != M; ++dest)
+      {
+        const int gap = std::abs(dest-my_rank);
+        const int root = std::sqrt(gap);
+        if (root*root != gap)
+          continue;
+        auto & subvectors = data[dest];
+        subvectors.resize(2);
+        for (int n_copies=0; n_copies != root+1; ++n_copies)
+          subvectors[0].push_back(dest);
+        subvectors[1].push_back(dest);
+      }
+  }
+
+
+
+  // Nested-vector multimap payload for each destination id in
+  // [0, M).  When c := |rank-d| is a perfect square, this rank queues
+  // for destination d a pair of subvectors (sqrt(c)+1 copies of d,
+  // then 1 copy), followed by a lone subvector holding 1 copy of d.
+  void fill_vector_data
+    (std::multimap<processor_id_type, std::vector<std::vector<unsigned int>>> & data,
+     int M)
+  {
+    const int my_rank = TestCommWorld->rank();
+    for (int dest=0; dest != M; ++dest)
+      {
+        const int gap = std::abs(dest-my_rank);
+        const int root = std::sqrt(gap);
+        if (root*root != gap)
+          continue;
+        // First entry: the two-subvector payload.
+        std::vector<std::vector<unsigned int>> payload(2);
+        payload[0].assign(root+1, dest);
+        payload[1].assign(1, dest);
+        data.emplace(dest, payload);
+        // Second entry: trimmed to one single-element subvector.
+        payload.pop_back();
+        payload[0].resize(1);
+        data.emplace(dest, payload);
+      }
+  }
+
+  // Push scalar data to M destination ids, then check what arrived here.
+  void testPushImpl(int M)
+  {
+    const int size = TestCommWorld->size(),
+              rank = TestCommWorld->rank();
+
+    std::map<processor_id_type, std::vector<unsigned int> > data, received_data;
+
+    fill_scalar_data(data, M);
+
+    auto collect_data =
+      [&received_data]
+      (processor_id_type pid,
+       const typename std::vector<unsigned int> & data)
+      {
+        auto & vec = received_data[pid];
+        vec.insert(vec.end(), data.begin(), data.end());
+      };
+
+    Parallel::push_parallel_vector_data(*TestCommWorld, data, collect_data);
+
+    // Check the ids p = rank, rank+size, ... that we're in charge of.
+    // Stop with p < M: this stride need not ever land on M exactly.
+    std::vector<std::size_t> checked_sizes(size, 0);
+    for (int p=rank; p < M; p += size)
+      for (int srcp=0; srcp != size; ++srcp)
+        {
+          int diffsize = std::abs(srcp-p);
+          int diffsqrt = std::sqrt(diffsize);
+          if (diffsqrt*diffsqrt != diffsize)
+            {
+              if (received_data.count(srcp))
+                {
+                  const std::vector<unsigned int> & datum = received_data[srcp];
+                  CPPUNIT_ASSERT_EQUAL(std::count(datum.begin(), datum.end(), p), std::ptrdiff_t(0));
+                }
+              continue;
+            }
+          // Square difference: expect exactly diffsqrt+1 copies of p from srcp.
+          CPPUNIT_ASSERT_EQUAL(received_data.count(srcp), std::size_t(1));
+          const std::vector<unsigned int> & datum = received_data[srcp];
+          CPPUNIT_ASSERT_EQUAL(std::count(datum.begin(), datum.end(), p), std::ptrdiff_t(diffsqrt+1));
+          checked_sizes[srcp] += diffsqrt+1;
+        }
+
+    for (int srcp=0; srcp != size; ++srcp)
+      CPPUNIT_ASSERT_EQUAL(checked_sizes[srcp], received_data[srcp].size());
+  }
+
+  // Push test with M equal to the communicator size.
+  void testPush()
+  {
+    testPushImpl(TestCommWorld->size());
+  }
+
+  // Push test with M = 2*(size+4) destination ids, more than ranks exist.
+  void testPushOversized()
+  {
+    testPushImpl((TestCommWorld->size() + 4) * 2);
+  }
+
+  // Pull test driver: query M ids and expect every value back squared.
+  void testPullImpl(int M)
+  {
+    std::map<processor_id_type, std::vector<unsigned int> > data, received_data;
+
+    fill_scalar_data(data, M);
+
+    // Answer every queried value with its square.
+    auto compose_replies =
+      []
+      (processor_id_type pid,
+       const std::vector<unsigned int> & query,
+       std::vector<unsigned int> & response)
+      {
+        response.resize(query.size());
+        for (unsigned int k=0; k != query.size(); ++k)
+          response[k] = query[k]*query[k];
+      };
+
+    // Check each reply against its query, then store it under pid.
+    auto collect_replies =
+      [&received_data]
+      (processor_id_type pid,
+       const std::vector<unsigned int> & query,
+       const std::vector<unsigned int> & response)
+      {
+        const std::size_t n_queries = query.size();
+        CPPUNIT_ASSERT_EQUAL(n_queries, response.size());
+        for (unsigned int k=0; k != n_queries; ++k)
+          {
+            CPPUNIT_ASSERT_EQUAL(query[k]*query[k], response[k]);
+          }
+        received_data[pid] = response;
+      };
+
+    // Do the pull
+    unsigned int * ex = nullptr;
+    Parallel::pull_parallel_vector_data
+      (*TestCommWorld, data, compose_replies, collect_replies, ex);
+
+    // Each stored reply must be the elementwise square of its query.
+    for (int dest=0; dest != M; ++dest)
+      {
+        CPPUNIT_ASSERT_EQUAL(data[dest].size(), received_data[dest].size());
+        for (auto k : index_range(data[dest]))
+          CPPUNIT_ASSERT_EQUAL(data[dest][k]*data[dest][k], received_data[dest][k]);
+      }
+  }
+
+  // Pull test with M equal to the communicator size.
+  void testPull()
+  {
+    testPullImpl(TestCommWorld->size());
+  }
+
+  // Pull test with M = 2*(size+4) destination ids, more than ranks exist.
+  void testPullOversized()
+  {
+    testPullImpl((TestCommWorld->size() + 4) * 2);
+  }
+
+  // Push nested-vector data to M destination ids, then check what arrived.
+  void testPushVecVecImpl(int M)
+  {
+    const int size = TestCommWorld->size(),
+              rank = TestCommWorld->rank();
+
+    std::map<processor_id_type, std::vector<std::vector<unsigned int>>> data;
+    std::map<processor_id_type, std::vector<unsigned int>> received_data;
+
+    fill_vector_data(data, M);
+
+    auto collect_data =
+      [&received_data]
+      (processor_id_type pid,
+       const typename std::vector<std::vector<unsigned int>> & data)
+      {
+        CPPUNIT_ASSERT_EQUAL(data.size(), std::size_t(2));  // before any indexing
+        CPPUNIT_ASSERT_EQUAL(data[1].size(), std::size_t(1));
+        CPPUNIT_ASSERT_EQUAL(data[0][0], data[1][0]);
+        auto & vec = received_data[pid];
+        vec.insert(vec.end(), data[0].begin(), data[0].end());
+      };
+
+    Parallel::push_parallel_vector_data(*TestCommWorld, data, collect_data);
+
+    // Check the ids p = rank, rank+size, ... that we're in charge of.
+    // Stop with p < M: this stride need not ever land on M exactly.
+    std::vector<std::size_t> checked_sizes(size, 0);
+    for (int p=rank; p < M; p += size)
+      for (int srcp=0; srcp != size; ++srcp)
+        {
+          int diffsize = std::abs(srcp-p);
+          int diffsqrt = std::sqrt(diffsize);
+          if (diffsqrt*diffsqrt != diffsize)
+            {
+              if (received_data.count(srcp))
+                {
+                  const std::vector<unsigned int> & datum = received_data[srcp];
+                  CPPUNIT_ASSERT_EQUAL(std::count(datum.begin(), datum.end(), p), std::ptrdiff_t(0));
+                }
+              continue;
+            }
+          // Square difference: expect exactly diffsqrt+1 copies of p from srcp.
+          CPPUNIT_ASSERT_EQUAL(received_data.count(srcp), std::size_t(1));
+          const std::vector<unsigned int> & datum = received_data[srcp];
+          CPPUNIT_ASSERT_EQUAL(std::count(datum.begin(), datum.end(), p), std::ptrdiff_t(diffsqrt+1));
+          checked_sizes[srcp] += diffsqrt+1;
+        }
+
+    for (int srcp=0; srcp != size; ++srcp)
+      CPPUNIT_ASSERT_EQUAL(checked_sizes[srcp], received_data[srcp].size());
+  }
+
+  // Nested-vector push test with M equal to the communicator size.
+  void testPushVecVec()
+  {
+    testPushVecVecImpl(TestCommWorld->size());
+  }
+
+  // Nested-vector push test with M = 2*(size+4) ids, more than ranks exist.
+  void testPushVecVecOversized()
+  {
+    testPushVecVecImpl((TestCommWorld->size() + 4) * 2);
+  }
+
+  // Nested-vector pull driver: query M ids, expect every value squared.
+  void testPullVecVecImpl(int M)
+  {
+    std::map<processor_id_type, std::vector<std::vector<unsigned int>>> data;
+    std::map<processor_id_type, std::vector<std::vector<unsigned int>>> received_data;
+
+    fill_vector_data(data, M);
+
+    auto compose_replies =
+      []
+      (processor_id_type pid,
+       const std::vector<std::vector<unsigned int>> & query,
+       std::vector<std::vector<unsigned int>> & response)
+      {
+        const std::size_t n_sub = query.size();
+        response.resize(n_sub);
+        for (unsigned int s=0; s != n_sub; ++s)
+          {
+            const std::size_t n_entries = query[s].size();
+            response[s].resize(n_entries);
+            for (unsigned int e=0; e != n_entries; ++e)
+              response[s][e] = query[s][e]*query[s][e];
+          }
+      };
+
+    // Check each reply against its query, then store both subvectors.
+    auto collect_replies =
+      [&received_data]
+      (processor_id_type pid,
+       const std::vector<std::vector<unsigned int>> & query,
+       const std::vector<std::vector<unsigned int>> & response)
+      {
+        const std::size_t n_sub = query.size();
+        CPPUNIT_ASSERT_EQUAL(n_sub, response.size());
+        for (unsigned int s=0; s != n_sub; ++s)
+          {
+            const std::size_t n_entries = query[s].size();
+            CPPUNIT_ASSERT_EQUAL(n_entries, response[s].size());
+            for (unsigned int e=0; e != n_entries; ++e)
+              CPPUNIT_ASSERT_EQUAL(query[s][e]*query[s][e], response[s][e]);
+          }
+        auto & stored = received_data[pid];
+        stored.emplace_back(response[0].begin(), response[0].end());
+        CPPUNIT_ASSERT_EQUAL(response[1].size(), std::size_t(1));
+        CPPUNIT_ASSERT_EQUAL(response[1][0], response[0][0]);
+        stored.emplace_back(response[1].begin(), response[1].end());
+      };
+
+    // Do the pull
+    std::vector<unsigned int> * ex = nullptr;
+    Parallel::pull_parallel_vector_data
+      (*TestCommWorld, data, compose_replies, collect_replies, ex);
+
+    // Each stored reply must be the elementwise square of its query.
+    for (int dest=0; dest != M; ++dest)
+      {
+        CPPUNIT_ASSERT_EQUAL(data[dest].size(), received_data[dest].size());
+        for (auto s : index_range(data[dest]))
+          for (auto e : index_range(data[dest][s]))
+            CPPUNIT_ASSERT_EQUAL(data[dest][s][e]*data[dest][s][e], received_data[dest][s][e]);
+      }
+  }
+
+
+  // Vector-of-vectors pull test with M equal to the communicator size.
+  void testPullVecVec()
+  {
+    const int comm_size = TestCommWorld->size();
+    testPullVecVecImpl(comm_size);
+  }
+
+
+  // Vector-of-vectors pull test with M = (size + 4) * 2, which is
+  // always larger than the communicator size.
+  void testPullVecVecOversized()
+  {
+    // This has to drive the *pull* implementation: delegating to
+    // testPushVecVecImpl would merely repeat testPushVecVecOversized
+    // and leave the oversized pull path without any coverage.
+    testPullVecVecImpl((TestCommWorld->size() + 4) * 2);
+  }
+
+
+  // Push test for multimap-keyed vectors of scalars.  The outgoing
+  // data comes from fill_scalar_data(data, M); everything received is
+  // stored per source processor and then checked for each id
+  // p = rank, rank+size, ... below M.
+  void testPushMultimapImpl(int M)
+  {
+    const int size = TestCommWorld->size(),
+              rank = TestCommWorld->rank();
+
+    // This is going to make sense because of C++11's guarantees
+    // regarding preservation of insert ordering in multimaps,
+    // combined with MPI's guarantees about non-overtaking
+    std::multimap<processor_id_type, std::vector<unsigned int> > data, received_data;
+
+    fill_scalar_data(data, M);
+
+    auto collect_data =
+      [&received_data]
+      (processor_id_type pid,
+       const typename std::vector<unsigned int> & data)
+      {
+        received_data.emplace(pid, data);
+      };
+
+    Parallel::push_parallel_vector_data(*TestCommWorld, data, collect_data);
+
+    // Test the received results, for each processor id p we're in
+    // charge of.
+    //
+    // The bound must be "p < M", not "p != M": p advances in strides
+    // of size starting at rank, so it only lands exactly on M when
+    // (M - rank) is a multiple of size.  Otherwise an inequality test
+    // would never become false and the loop would not terminate.
+    std::vector<std::size_t> checked_sizes(size, 0);
+    for (int p=rank; p < M; p += size)
+      for (int srcp=0; srcp != size; ++srcp)
+        {
+          int diffsize = std::abs(srcp-p);
+          int diffsqrt = std::sqrt(diffsize);
+          auto rng = received_data.equal_range(srcp);
+
+          // Unless |srcp-p| is a perfect square, nothing received
+          // from srcp may contain the value p.
+          if (diffsqrt*diffsqrt != diffsize)
+            {
+              for (auto & pv_it : as_range(rng))
+                {
+                  CPPUNIT_ASSERT_EQUAL(std::count(pv_it.second.begin(), pv_it.second.end(), p), std::ptrdiff_t(0));
+                }
+              continue;
+            }
+
+          // Otherwise srcp must have sent us something.  The first
+          // vector containing p should hold diffsqrt+1 copies of it,
+          // and the vector right after it exactly one copy.
+          CPPUNIT_ASSERT(rng.first != rng.second);
+          for (auto pv_it = rng.first; pv_it != rng.second; ++pv_it)
+            {
+              std::ptrdiff_t cnt = std::count(pv_it->second.begin(), pv_it->second.end(), p);
+              if (cnt)
+                {
+                  CPPUNIT_ASSERT_EQUAL(cnt, std::ptrdiff_t(diffsqrt+1));
+                  auto pv_it2 = pv_it; ++pv_it2;
+                  CPPUNIT_ASSERT(pv_it2 != rng.second);
+                  std::ptrdiff_t cnt2 = std::count(pv_it2->second.begin(), pv_it2->second.end(), p);
+                  CPPUNIT_ASSERT_EQUAL(cnt2, std::ptrdiff_t(1));
+                  checked_sizes[srcp] += cnt + cnt2;
+                  break;
+                }
+            }
+        }
+
+    // Everything received from each source must have been accounted
+    // for by the checks above, i.e. there was no unexpected data.
+    for (int srcp=0; srcp != size; ++srcp)
+      {
+        std::size_t total_size = 0;
+        for (auto & pv_it : as_range(received_data.equal_range(srcp)))
+          total_size += pv_it.second.size();
+        CPPUNIT_ASSERT_EQUAL(checked_sizes[srcp], total_size);
+      }
+  }
+
+
+  // Multimap push test with M equal to the communicator size.
+  void testPushMultimap()
+  {
+    const int comm_size = TestCommWorld->size();
+    testPushMultimapImpl(comm_size);
+  }
+
+
+  // Multimap push test with M = (size + 4) * 2, which is always
+  // larger than the communicator size.
+  void testPushMultimapOversized()
+  {
+    const int oversized_M = (TestCommWorld->size() + 4) * 2;
+    testPushMultimapImpl(oversized_M);
+  }
+
+
+  // Push test for multimap-keyed vectors of vectors.  The outgoing
+  // data comes from fill_vector_data(data, M); everything received is
+  // stored per source processor and then checked for each id
+  // p = rank, rank+size, ... below M.
+  void testPushMultimapVecVecImpl(int M)
+  {
+    const int size = TestCommWorld->size(),
+              rank = TestCommWorld->rank();
+
+    // This is going to make sense because of C++11's guarantees
+    // regarding preservation of insert ordering in multimaps,
+    // combined with MPI's guarantees about non-overtaking
+    std::multimap<processor_id_type, std::vector<std::vector<unsigned int>>> data, received_data;
+
+    fill_vector_data(data, M);
+
+    auto collect_data =
+      [&received_data]
+      (processor_id_type pid,
+       const typename std::vector<std::vector<unsigned int>> & data)
+      {
+        received_data.emplace(pid, data);
+      };
+
+    Parallel::push_parallel_vector_data(*TestCommWorld, data, collect_data);
+
+    // Test the received results, for each processor id p we're in
+    // charge of.
+    //
+    // The bound must be "p < M", not "p != M": p advances in strides
+    // of size starting at rank, so it only lands exactly on M when
+    // (M - rank) is a multiple of size.  Otherwise an inequality test
+    // would never become false and the loop would not terminate.
+    std::vector<std::size_t> checked_sizes(size, 0);
+    for (int p=rank; p < M; p += size)
+      for (int srcp=0; srcp != size; ++srcp)
+        {
+          int diffsize = std::abs(srcp-p);
+          int diffsqrt = std::sqrt(diffsize);
+          auto rng = received_data.equal_range(srcp);
+
+          // Unless |srcp-p| is a perfect square, nothing received
+          // from srcp may contain the value p.
+          if (diffsqrt*diffsqrt != diffsize)
+            {
+              for (auto & pvv : as_range(rng))
+                {
+                  for (auto & v : pvv.second)
+                    CPPUNIT_ASSERT_EQUAL(std::count(v.begin(), v.end(), p), std::ptrdiff_t(0));
+                }
+              continue;
+            }
+
+          // Otherwise srcp must have sent us something.  Entries are
+          // examined in pairs: a two-vector entry followed by a
+          // one-vector entry.
+          CPPUNIT_ASSERT(rng.first != rng.second);
+          for (auto pvv_it = rng.first; pvv_it != rng.second; ++pvv_it)
+            {
+              CPPUNIT_ASSERT_EQUAL(pvv_it->second.size(), std::size_t(2));
+              std::ptrdiff_t cnt = std::count(pvv_it->second[0].begin(), pvv_it->second[0].end(), p);
+              if (cnt)
+                {
+                  // The pair for p: diffsqrt+1 copies, then one copy,
+                  // then one more copy in the entry that follows.
+                  CPPUNIT_ASSERT_EQUAL(cnt, std::ptrdiff_t(diffsqrt+1));
+                  std::ptrdiff_t cnt2 = std::count(pvv_it->second[1].begin(), pvv_it->second[1].end(), p);
+                  CPPUNIT_ASSERT_EQUAL(cnt2, std::ptrdiff_t(1));
+                  auto pvv_it2 = pvv_it; ++pvv_it2;
+                  CPPUNIT_ASSERT(pvv_it2 != rng.second);
+                  CPPUNIT_ASSERT_EQUAL(pvv_it2->second.size(), std::size_t(1));
+                  std::ptrdiff_t cnt3 = std::count(pvv_it2->second[0].begin(), pvv_it2->second[0].end(), p);
+                  CPPUNIT_ASSERT_EQUAL(cnt3, std::ptrdiff_t(1));
+                  checked_sizes[srcp] += cnt + cnt2 + cnt3;
+                  break;
+                }
+
+              // Not the pair for p: step over its second entry.  This
+              // is checked in every build mode, because the loop
+              // increment that follows must not advance an iterator
+              // that already equals rng.second.
+              ++pvv_it;
+              CPPUNIT_ASSERT(pvv_it != rng.second);
+            }
+        }
+
+    // Everything received from each source must have been accounted
+    // for by the checks above, i.e. there was no unexpected data.
+    for (int srcp=0; srcp != size; ++srcp)
+      {
+        std::size_t total_size = 0;
+        for (auto & pvv : as_range(received_data.equal_range(srcp)))
+          for (auto & v : pvv.second)
+            total_size += v.size();
+        CPPUNIT_ASSERT_EQUAL(checked_sizes[srcp], total_size);
+      }
+  }
+
+
+  // Multimap vector-of-vectors push test with M equal to the
+  // communicator size.
+  void testPushMultimapVecVec()
+  {
+    const int comm_size = TestCommWorld->size();
+    testPushMultimapVecVecImpl(comm_size);
+  }
+
+
+  // Multimap vector-of-vectors push test with M = (size + 4) * 2,
+  // which is always larger than the communicator size.
+  void testPushMultimapVecVecOversized()
+  {
+    const int oversized_M = (TestCommWorld->size() + 4) * 2;
+    testPushMultimapVecVecImpl(oversized_M);
+  }
+
+};
+
+// Register the ParallelSyncTest suite with CppUnit's default test
+// factory registry.
+CPPUNIT_TEST_SUITE_REGISTRATION( ParallelSyncTest );