Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6,663 changes: 6,663 additions & 0 deletions coreneuron/logging/easylogging.h

Large diffs are not rendered by default.

477 changes: 477 additions & 0 deletions coreneuron/logging/logging.hpp

Large diffs are not rendered by default.

127 changes: 127 additions & 0 deletions coreneuron/logging/mpi_communicator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#ifndef _CORENEURON_MPI_COMMUNICATOR_HPP_
#define _CORENEURON_MPI_COMMUNICATOR_HPP_

#include <stdexcept>
#include <vector>
using namespace std;

#include <mpi.h>

#include "coreneuron/interfaces.hpp"
#include "coreneuron/logging.hpp"


/**
 * Create a fresh _empty_ `MPI_Status` object.
 *
 * "Empty" follows the MPI Standard v3, section 3.7.3: `MPI_ERROR` set to
 * `MPI_SUCCESS`, `MPI_SOURCE` set to `MPI_ANY_SOURCE` and `MPI_TAG` set to
 * `MPI_ANY_TAG`.
 *
 * Rationale: some MPI implementations leave the public members of
 * `MPI_Status` uninitialized, so we always start from a known state.
 *
 * @returns _empty_ `MPI_Status` object
 *
 * @since v0.5.0
 *
 * @ingroup Utilities
 */
inline static MPI_Status MPI_Status_factory()
{
    MPI_Status status;
    status.MPI_SOURCE = MPI_ANY_SOURCE;
    status.MPI_TAG = MPI_ANY_TAG;
    status.MPI_ERROR = MPI_SUCCESS;
    return status;
}


namespace coreneuron
{
namespace mpi
{
/**
 * Exception type thrown for failed MPI calls.
 *
 * The raw message is stored by std::runtime_error; what() prepends the
 * "mpi error: " prefix (see implementation file).
 */
class MPIError
    : public runtime_error
{
    public:
        //! Construct from an optional plain message.
        MPIError(const string& msg="");
        //! Message with "mpi error: " prefix.
        virtual const char* what() const throw();
        //! Build an MPIError from an MPI error code, resolving the code to
        //! the implementation-provided description via MPI_Error_string.
        static MPIError from_code(const int err_code);
};

/**
 * Check an MPI return code and raise on failure.
 *
 * Throws MPIError — with the implementation's descriptive string for the
 * code — whenever @p err_code is anything other than `MPI_SUCCESS`.
 *
 * @param[in] err_code MPI error code as returned from an MPI call
 *
 * @since v0.5.0
 *
 * @ingroup Utilities
 */
inline static void check_mpi_error(const int err_code)
{
    // Fast path: nothing to do on success.
    if (err_code == MPI_SUCCESS) {
        return;
    }
    throw MPIError::from_code(err_code);
}


class MPIStatus;


/**
 * ICommunicator implementation backed by an `MPI_Comm` handle.
 *
 * Rank, size and name are cached by set_comm() so the accessors do not
 * re-query MPI on every call.
 */
class MPICommunicator
    : public ICommunicator
{
    //! @{
    int _rank;     // cached rank of this process in `comm` (set by set_comm)
    int _size;     // cached number of processes in `comm` (set by set_comm)
    string _name;  // communicator name; "world" if MPI reports an empty name
    //! @}

    public:
        //! @{
        MPI_Comm comm;  // the wrapped MPI communicator handle
        //! @}

        //! @{
        //! Default ctor leaves the object unusable until set_comm() is called.
        MPICommunicator();
        //! Convenience ctor; delegates to set_comm().
        MPICommunicator(MPI_Comm comm);
        //! @}

        //! @{
        //! Wrap @p comm, cache rank/size/name and attach a fresh MPIStatus.
        virtual void set_comm(MPI_Comm comm);
        virtual int size();
        virtual int rank();
        virtual string name();
        //! @}
};


/**
 * IStatus implementation that passes convergence information between
 * neighbouring MPI ranks (rank r sends forward to rank r+1).
 */
class MPIStatus
    : public IStatus
{
    protected:
        //! Per-rank convergence flags; index is the MPI rank.
        vector<bool> converged;
        //! Non-owning downcast view of the communicator; set by set_comm().
        MPICommunicator* mpi;

    public:
        //! Attach to @p comm; must be an MPICommunicator.
        virtual void set_comm(ICommunicator* comm);
        virtual void clear() override;
        virtual void set_converged(bool converged) override;
        virtual bool get_converged(int rank) override;
        virtual void post(int tag) override;
        //! Forward this rank's convergence flag to the next rank.
        virtual void send(int tag) override;
        //! Receive the previous rank's convergence flag (skipped if already
        //! known to be converged).
        virtual void recv(int tag) override;
};
} // ::coreneuron::mpi
} // ::coreneuron


inline MAKE_LOGGABLE(MPI_Status, mpi_status, os);

#include "coreneuron/mpi_communicator_impl.hpp"

#endif
165 changes: 165 additions & 0 deletions coreneuron/logging/mpi_communicator_impl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#include "coreneuron/mpi_communicator.hpp"

#include "coreneuron/logging.hpp"


namespace coreneuron
{
namespace mpi
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As in many other places in this PR it's not entirely clear where your choice comes from and what this particular change is there for. There is a lot of documentation / commenting missing

{
// Construct with an optional message. The raw message is stored by
// std::runtime_error; what() adds the "mpi error: " prefix.
MPIError::MPIError(const string& msg)
    : runtime_error(msg)
{}

const char* MPIError::what() const throw()
{
return (string("mpi error: ") + string(runtime_error::what())).c_str();
}

/**
 * Build an MPIError for @p err_code, resolving the code to the MPI
 * implementation's descriptive string via MPI_Error_string.
 */
MPIError MPIError::from_code(const int err_code)
{
    char description[MPI_MAX_ERROR_STRING];
    int length = 0;
    // MPI_Error_string itself may fail; verify its return code too.
    check_mpi_error(MPI_Error_string(err_code, description, &length));
    return MPIError("MPI Error: " + string(description, length) + " (code=" + to_string(err_code) + ")");
}


// Default-construct without a communicator; set_comm() must be called
// before the object is used.
MPICommunicator::MPICommunicator()
{}

// Wrap the given MPI communicator immediately (delegates to set_comm()).
MPICommunicator::MPICommunicator(MPI_Comm comm)
{
    set_comm(comm);
}

/**
 * Wrap @p comm: cache its size, rank and name, and attach a fresh MPIStatus.
 *
 * The name defaults to "world" when MPI reports an empty name for the
 * communicator.
 *
 * BUGFIX: the return codes of MPI_Comm_size and MPI_Comm_rank were silently
 * dropped while MPI_Comm_get_name was checked; all three calls are now
 * verified via check_mpi_error.
 */
void MPICommunicator::set_comm(MPI_Comm comm)
{
    this->comm = comm;
    check_mpi_error(MPI_Comm_size(this->comm, &(this->_size)));
    check_mpi_error(MPI_Comm_rank(this->comm, &(this->_rank)));

    int len = 0;
    char buff[MPI_MAX_OBJECT_NAME];
    check_mpi_error(MPI_Comm_get_name(this->comm, buff, &len));
    if (len == 0) {
        // no explicit name set on the communicator
        this->_name = string("world");
    } else {
        this->_name = string(buff, len);
    }

    // fresh status object bound back to this communicator
    shared_ptr<MPIStatus> status = make_shared<MPIStatus>();
    this->status = status;
    this->status->set_comm(this);
}

int MPICommunicator::size()
{
return this->_size;
}

int MPICommunicator::rank()
{
return this->_rank;
}

//! Communicator name; "world" if MPI reported none (cached by set_comm()).
string MPICommunicator::name()
{
    return _name;
}


/**
 * Attach to @p comm and size the per-rank convergence table.
 *
 * @p comm must actually be an MPICommunicator; the MPI-specific send/recv
 * paths need the concrete type.
 *
 * BUGFIX: the failed-downcast check was a bare `assert`, which is compiled
 * out in NDEBUG builds — leaving `mpi` null and every later send()/recv()
 * dereferencing a null pointer. Fail loudly in all build modes instead.
 */
void MPIStatus::set_comm(ICommunicator* comm)
{
    this->comm = comm;
    this->converged.resize(comm->size());

    this->mpi = dynamic_cast<MPICommunicator*>(comm);
    if (this->mpi == nullptr) {
        throw MPIError("MPIStatus::set_comm requires an MPICommunicator");
    }
}

//! Reset every rank's cached convergence flag to "not converged".
void MPIStatus::clear()
{
    converged.assign(converged.size(), false);
}

void MPIStatus::set_converged(bool converged)
{
ML_CLOG(DEBUG, "Controller", "set converged for rank " << this->comm->rank() << " to "
<< "'" << boolalpha << converged << "'");
this->converged.at(this->comm->rank()) = converged;
assert(this->converged.at(this->comm->rank()) == converged);
}

bool MPIStatus::get_converged(int rank)
{
return this->converged.at(rank);
}

// Intentionally a no-op: no posting step is required for the blocking
// MPI_Send/MPI_Recv exchange used by send()/recv(). The @p tag parameter
// only exists to satisfy the IStatus interface.
void MPIStatus::post(int tag)
{
    UNUSED(tag);
}

/**
 * Forward this rank's convergence flag to the next rank (blocking MPI_Send).
 *
 * No-op on single-process runs and on the last rank (there is nobody to
 * send forward to).
 *
 * BUGFIX (logging only): the previous log expression
 * `(bool)iconverged == IStatus::CONVERGED` collapses the flag to 0/1 before
 * comparing with the enum value, so the logged status was wrong whenever
 * `IStatus::CONVERGED != 1`. Compare the raw value instead.
 */
void MPIStatus::send(int tag)
{
    // don't send forward if: single processor run, or we're the last processor
    if (mpi->size() == 1) { return; }
    if (mpi->rank() == mpi->size() - 1) { return; }

    int iconverged = converged.at(mpi->rank()) ? IStatus::CONVERGED : IStatus::NOT_CONVERGED;
    int dest_rank = (mpi->rank() + 1) % mpi->size();

    ML_CLOG(DEBUG, "Controller", "sending converged status to rank " << dest_rank
                                 << " with tag " << tag << ": "
                                 << boolalpha << (iconverged == IStatus::CONVERGED));
    int err = MPI_Send(&iconverged, 1, MPI_INT, dest_rank, tag, mpi->comm);
    check_mpi_error(err);
    ML_CLOG(DEBUG, "Controller", "sent converged status");
}

/**
 * Receive the previous rank's convergence flag (blocking MPI_Recv).
 *
 * No-op on single-process runs and on rank 0 (there is nobody before us).
 * The receive is also skipped when the previous rank is already recorded as
 * converged — its flag cannot change back.
 *
 * BUGFIXES (logging only):
 *  - the "receiving" log hard-coded "with tag '1'" while the actual MPI_Recv
 *    used the @p tag parameter; log the real tag;
 *  - `(bool)iconverged == IStatus::CONVERGED` compared a 0/1 value with the
 *    enum, logging the wrong status whenever `CONVERGED != 1`;
 *  - typo "recieve" -> "receive".
 */
void MPIStatus::recv(int tag)
{
    // don't recv if: single processor run, or we're the first processor
    if (mpi->size() == 1) { return; }
    if (mpi->rank() == 0) { return; }

    if (get_converged(mpi->rank() - 1)) {
        ML_CLOG(DEBUG, "Controller", "skipping status receive as previous is stored as converged");
        return;
    }

    MPI_Status stat = MPI_Status_factory();
    int iconverged = IStatus::NOT_CONVERGED;
    // rank() >= 1 is guaranteed above, so no modulo wrap-around is needed
    int src_rank = mpi->rank() - 1;
    ML_CLOG(DEBUG, "Controller", "receiving converged status from rank " << src_rank
                                 << " with tag " << tag);
    int err = MPI_Recv(&iconverged, 1, MPI_INT, src_rank, tag, mpi->comm, &stat);
    check_mpi_error(err);
    ML_CLOG(DEBUG, "Controller", "received converged status from rank " << src_rank
                                 << " with tag " << tag << ": "
                                 << boolalpha << (iconverged == IStatus::CONVERGED));

    converged.at(mpi->rank() - 1) = (iconverged == IStatus::CONVERGED);
}
} // ::coreneuron::mpi
} // ::coreneuron


/**
 * easylogging-style stream formatter for `MPI_Status`.
 *
 * An _empty_ status (cf. MPI_Status_factory()) is rendered as
 * "MPI_Status(empty)"; otherwise source, tag and the implementation's
 * description of the error code are printed.
 */
MAKE_LOGGABLE(MPI_Status, mpi_status, os)
{
    const bool is_empty = mpi_status.MPI_TAG == MPI_ANY_TAG
                          && mpi_status.MPI_SOURCE == MPI_ANY_SOURCE
                          && mpi_status.MPI_ERROR == MPI_SUCCESS;
    if (is_empty) {
        os << "MPI_Status(empty)";
        return os;
    }

    char err_str[MPI_MAX_ERROR_STRING];
    int err_len = 0;
    // resolve the stored error code to the implementation's description
    coreneuron::mpi::check_mpi_error(MPI_Error_string(mpi_status.MPI_ERROR, err_str, &err_len));
    os << "MPI_Status(source=" << to_string(mpi_status.MPI_SOURCE) << ", "
       << "tag=" << to_string(mpi_status.MPI_TAG) << ", "
       << "error=" << string(err_str, err_len) << ")";
    return os;
}
6 changes: 3 additions & 3 deletions coreneuron/nrniv/main1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ THE POSSIBILITY OF SUCH DAMAGE.
#include "coreneuron/nrniv/multisend.h"
#include "coreneuron/utils/file_utils.h"
#include "coreneuron/nrniv/nrn2core_direct.h"
#include "coreneuron/config.hpp"
#include "coreneuron/logging.hpp"
#include <string.h>
#include <climits>

Expand Down Expand Up @@ -491,9 +493,7 @@ extern "C" int run_solve_core(int argc, char** argv) {
double tstop = nrnopt_get_dbl("--tstop");

if (tstop < t && nrnmpi_myid == 0) {
printf("Error: Stop time (%lf) < Start time (%lf), restoring from checkpoint? \n",
tstop, t);
abort();
ML_LOG(ERROR, "Error: Stop time" << tstop << "Start time" << "restoring from checkpoint? \n";
}

// In direct mode there are likely trajectory record requests
Expand Down
5 changes: 2 additions & 3 deletions coreneuron/nrniv/nrn_acc_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,11 +375,10 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
d_ptr = (int*)acc_copyin(info->cellsize, sizeof(int) * info->nwarp);
acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*));
} else {
printf("\n ERROR: only --cell_permute = [12] implemented");
abort();
ML_LOG(ERROR, "\n ERROR: only --cell_permute = [12] implemented");
}
} else {
printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
ML_LOG(INFO, "\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
}
}

Expand Down
4 changes: 2 additions & 2 deletions coreneuron/nrniv/tqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -518,8 +518,8 @@ void splay(SPBLK* n, SPTREE* q) {
#ifdef DEBUG
if (q->root != prev) {
/* fprintf(stderr, " *** bug in splay: n not in q *** " ); */
abort();
}
ML_LOG(FATAL, "fatal error");
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bad indenting...

#endif

n->leftlink = left;
Expand Down
2 changes: 1 addition & 1 deletion coreneuron/nrnmpi/nrnmpi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ void nrn_abort(int errcode) {
} else
#endif
{
abort();
ML_LOG(FATAL, "fatal error");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The program will still need to be aborted.

}
}

Expand Down