diff --git a/include/graphblas/algorithms/conjugate_gradient.hpp b/include/graphblas/algorithms/conjugate_gradient.hpp
index 11e910566..157afd4cf 100644
--- a/include/graphblas/algorithms/conjugate_gradient.hpp
+++ b/include/graphblas/algorithms/conjugate_gradient.hpp
@@ -314,43 +314,77 @@ namespace grb {
 				std::cerr << "Error: at least one CG iteration must be requested\n";
 				return grb::ILLEGAL;
 			}
+
+			// dense descriptor
+			if( descr & grb::descriptors::dense ) {
+				if( grb::nnz( x ) != grb::size( x ) ) {
+					std::cerr << "Error: x was sparse while the dense descriptor was given\n";
+					return grb::ILLEGAL;
+				}
+				if( grb::nnz( b ) != grb::size( b ) ) {
+					std::cerr << "Error: b was sparse while the dense descriptor was given\n";
+					return grb::ILLEGAL;
+				}
+				if( grb::nnz( r ) != grb::size( r ) ) {
+					std::cerr << "Error: r was sparse while the dense descriptor was given\n";
+					return grb::ILLEGAL;
+				}
+				if( grb::nnz( temp ) != grb::size( temp ) ) {
+					std::cerr << "Error: temp was sparse while the dense descriptor was "
+						<< "given\n";
+					return grb::ILLEGAL;
+				}
+				if( preconditioned &&
+					(grb::nnz( temp_precond ) != grb::size( temp_precond ))
+				) {
+					std::cerr << "Error: temp_precond was sparse while the dense descriptor "
+						<< "was given\n";
+					return grb::ILLEGAL;
+				}
+			}
 		}

 		// set pure output fields to neutral defaults
 		iterations = 0;
 		residual = std::numeric_limits< double >::infinity();

-		// make x and b structurally dense (if not already) so that the remainder
+		// declare internal scalars
+		IOType sigma, bnorm, alpha, beta;
+
+		// make x structurally dense (if not already) so that the remainder
 		// algorithm can safely use the dense descriptor for faster operations
-		{
-			RC rc = grb::SUCCESS;
+		grb::RC ret = grb::SUCCESS;
+		if( !(descr & grb::descriptors::dense) ) {
 			if( nnz( x ) != n ) {
-				rc = grb::set< descriptors::invert_mask | descriptors::structural >(
+				ret = grb::set< descriptors::invert_mask | descriptors::structural >(
 					x, x, zero );
+				assert( ret == grb::SUCCESS );
 			}
-			if( rc != grb::SUCCESS ) { return rc; }
 			assert( nnz( x ) == n );
 		}
-		IOType sigma, bnorm, alpha, beta;
-
 		// r = b - temp;
-		grb::RC ret = grb::set( temp, 0 ); assert( ret == grb::SUCCESS );
+		ret = ret ? ret : grb::set< descr >( temp, 0 );
+		assert( ret == grb::SUCCESS );
 		ret = ret ? ret : grb::mxv< descr_dense >( temp, A, x, ring );
 		assert( ret == grb::SUCCESS );
-		ret = ret ? ret : grb::set( r, zero ); assert( ret == grb::SUCCESS );
-		// note: no dense descriptor since we actually allow sparse b
-		ret = ret ? ret : grb::foldl( r, b, ring.getAdditiveMonoid() );
-		// from here onwards, r, temp, x are dense and will remain so
-		assert( nnz( r ) == n );
-		assert( nnz( temp ) == n );
+		ret = ret ? ret : grb::set< descr >( r, zero );
+		assert( ret == grb::SUCCESS );
+		// note: no forced dense descriptor since we actually allow sparse b,
+		// but if b was already dense and the dense descriptor was given, we
+		// will respect that
+		ret = ret ? ret : grb::foldl< descr >( r, b, ring.getAdditiveMonoid() );
+		// note: at this point, r and temp are guaranteed dense, so we force a dense
+		// descriptor
 		ret = ret ? ret : grb::foldl< descr_dense >( r, temp, minus );
 		assert( ret == grb::SUCCESS );

 		// bnorm = b' * b;
+		// Note that b can be structurally sparse (unless otherwise guaranteed by the
+		// user).
 		bnorm = zero;
-		ret = ret ? ret : grb::dot< descr_dense >(
+		ret = ret ? ret : grb::dot< descr >(
 			bnorm, b, b,
 			ring.getAdditiveMonoid(),
@@ -381,7 +415,9 @@ namespace grb {

 		// z = M^-1r
 		if( preconditioned ) {
-			ret = ret ? ret : grb::set( z, 0 ); // also ensures z is dense, henceforth
+			ret = ret ? ret : grb::set< descr >( z, 0 );
+			// henceforth, z is also structurally dense and we can force a dense
+			// descriptor for it
 			if( preconditioned == 2 ) {
 				ret = ret ? ret : grb::wait( z, r );
 			}
@@ -389,10 +425,12 @@ namespace grb {
 		}
 		// else, z equals r (by reference)

 		// u = z;
-		ret = ret ? ret : grb::set( u, z );
+		ret = ret ? ret : grb::set< descr >( u, z );
 		assert( ret == grb::SUCCESS );
-		// from here onwards, u is dense; i.e., all vectors are dense from now on,
-		// and we can freely use the dense descriptor in the subsequent
+		// henceforth, u is also structurally dense -- in fact, all vectors in this
+		// algorithm except b are now guaranteed dense. We may hence liberally force
+		// the use of the dense descriptor in the main while loop of the CG (since
+		// it does not refer to b).

 		// sigma = r' * z;
 		sigma = zero;
@@ -435,7 +473,7 @@ namespace grb {
 		assert( ret == grb::SUCCESS );

 		// alpha = sigma / beta;
-		ret = ret ? ret : grb::apply( alpha, sigma, beta, divide );
+		ret = ret ? ret : grb::apply< descr >( alpha, sigma, beta, divide );
 		assert( ret == grb::SUCCESS );

 		// x = x + alpha * u;
@@ -489,7 +527,7 @@ namespace grb {
 		}

 		// alpha = beta / sigma;
-		ret = ret ? ret : grb::apply( alpha, beta, sigma, divide );
+		ret = ret ? ret : grb::apply< descr >( alpha, beta, sigma, divide );
 		assert( ret == grb::SUCCESS );

 		// u_next = z + beta * u_previous;
diff --git a/include/transition/solver.h b/include/transition/solver.h
index ba1afa008..09e4f2acc 100644
--- a/include/transition/solver.h
+++ b/include/transition/solver.h
@@ -225,6 +225,22 @@ typedef int (*sparse_cg_preconditioner_dxx_t) (
 * @param[in] a  The nonzero values of the system matrix.
 * @param[in] ja The column indices of the nonzeroes of the system matrix.
 * @param[in] ia The row offset array of the system matrix.
+ * @param[in] precon Whether the CG handle is expected to execute solves with
+ *                   preconditioning.
+ * @param[in] numa   Whether the CG handle is expected to execute solves that
+ *                   employ more than one NUMA domain, in which case any memory
+ *                   allocated during the construction of this handle may be
+ *                   allocated in a NUMA-aware fashion.
+ *
+ * The Boolean flags \a precon and \a numa indicated above are \em hints to the
+ * underlying ALP implementation.
+ *
+ * \note This means that even if \a precon was set to false during
+ *       handle creation, a user may still request preconditioning and expect
+ *       no error. Similarly, even if \a numa was set to true during
+ *       handle creation, the underlying ALP implementation may still opt to
+ *       \em not employ NUMA-aware allocation, for example, if it was configured
+ *       without libnuma support.
 *
 * This variant is for single-precision floating point nonzeroes and integer
 * \a ja and \a ia arrays, as also indicated by the sii postfix.
@@ -244,25 +260,119 @@ typedef int (*sparse_cg_preconditioner_dxx_t) (
 * @returns #NO_ERROR          If initialisation of the handle proceeded
 *                             successfully. Only in this case shall \a handle
 *                             henceforth be a \em valid handle.
+ *
+ * On returning #NO_ERROR, \a handle shall correspond to an initialised (P)CG
+ * solver instance. Note that this handle contains workspace memory that is
+ * necessary for auxiliary vectors required by the CG algorithm. When
+ * preconditioning is employed, four such workspace vectors of size \a n are
+ * required; otherwise, only three such vectors are required. This memory will
+ * be freed on a call to #sparse_cg_destroy_sii.
+ */
+sparse_err_t sparse_cg_init_opt_sii(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const int * const ja, const int * const ia,
+	const bool precon, const bool numa
+);
+
+/**
+ * Variant of #sparse_cg_init_opt_sii that results in a #sparse_cg_handle_t with
+ * default hints (with preconditioning and with NUMA).
+ *
+ * @see sparse_cg_init_opt_sii for full details.
 */
 sparse_err_t sparse_cg_init_sii(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const float * const a, const int * const ja, const int * const ia
 );

+/**
+ * Variant of #sparse_cg_init_opt_sii that does not allocate workspace, but
+ * instead employs user-supplied memory as its workspace memory.
+ *
+ * This variant of initialisation guarantees no additional dynamic memory
+ * allocation will be performed. A call to #sparse_cg_destroy_sii will likewise
+ * \em not free up the given \a workspace memory.
+ *
+ * @param[in,out] workspace The user-supplied memory that will be used as the
+ *                          work space for the returned PCG solver \a handle.
+ *
+ * Previous contents of the given \a workspace will be ignored, and, when given
+ * to this function, should be considered lost. On a successful call, the given
+ * memory should be considered to be under the sole custody of the PCG solver
+ * \a handle until the time the handle is destroyed; at that point, the
+ * ownership of the given memory will be returned to the caller. If the call to
+ * this function is not successful, the ownership of the memory region returns
+ * to the caller immediately. For as long as ownership of the memory region is
+ * not with the caller, the contents of the memory region are undefined, and
+ * code not controlled by ALP is not allowed to write into the memory region.
+ *
+ * @param[in] workspace_size The size of the memory pointed to by \a workspace.
+ *
+ * The given workspace must have byte size greater than or equal to the value
+ * returned by a call to #sparse_cg_workspace_size_s.
+ *
+ * \note For variants that use double-precision values, the workspace must have
+ *       byte size greater than or equal to the value returned by a call to
+ *       #sparse_cg_workspace_size_d.
+ *
+ * \note The hints that can be supplied to #sparse_cg_init_opt_sii affect its
+ *       internal workspace allocation mechanisms. Therefore, those hints do
+ *       not apply to this manual PCG solver initialisation variant.
+ *
+ * In addition to the possible return codes defined by #sparse_cg_init_opt_sii,
+ * a call to this variant may furthermore return:
+ *
+ * @returns #NULL_ARGUMENT    If \a workspace equals NULL.
+ * @returns #ILLEGAL_ARGUMENT If \a workspace_size is smaller than required.
+ *
+ * Different from the specification of #sparse_cg_init_opt_sii, this variant
+ * shall never return #OUT_OF_MEMORY.
+ *
+ * For further details, see #sparse_cg_init_opt_sii.
+ */
+sparse_err_t sparse_cg_manual_init_sii(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const int * const ja, const int * const ia,
+	void * workspace, const size_t workspace_size
+);
+
 /**
  * Initialises a #sparse_cg_handle_t object.
  *
  * This variant is for double-precision floating point nonzeroes and integer
  * \a ja and \a ia arrays, as also indicated by the dii postfix.
  *
- * @see #sparse_cg_init_sii for full documentation.
+ * @see #sparse_cg_init_opt_sii for full documentation.
+ */
+sparse_err_t sparse_cg_init_opt_dii(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const int * const ja, const int * const ia,
+	const bool precon, const bool numa
+);
+
+/**
+ * Variant of #sparse_cg_init_opt_dii that results in a #sparse_cg_handle_t with
+ * default hints (with preconditioning and with NUMA).
+ *
+ * @see sparse_cg_init_opt_dii for full details.
 */
 sparse_err_t sparse_cg_init_dii(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const double * const a, const int * const ja, const int * const ia
 );

+/**
+ * Variant of #sparse_cg_init_opt_dii that does not allocate workspace, but
+ * instead employs user-supplied memory as its workspace memory.
+ *
+ * @see #sparse_cg_manual_init_sii for full details.
+ */
+sparse_err_t sparse_cg_manual_init_dii(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const int * const ja, const int * const ia,
+	void * workspace, const size_t workspace_size
+);
+
 /**
  * Initialises a #sparse_cg_handle_t object.
  *
@@ -270,13 +380,37 @@ sparse_err_t sparse_cg_init_dii(
  * size_t-valued \a ja, and integer-valued \a ia, as also indicated by
  * the siz postfix.
  *
- * @see #sparse_cg_init_sii for full documentation.
+ * @see #sparse_cg_init_opt_sii for full documentation.
+ */
+sparse_err_t sparse_cg_init_opt_siz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const int * const ja, const size_t * const ia,
+	const bool precon, const bool numa
+);
+
+/**
+ * Variant of #sparse_cg_init_opt_siz that results in a #sparse_cg_handle_t with
+ * default hints (with preconditioning and with NUMA).
+ *
+ * @see sparse_cg_init_opt_siz for full details.
 */
 sparse_err_t sparse_cg_init_siz(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const float * const a, const int * const ja, const size_t * const ia
 );

+/**
+ * Variant of #sparse_cg_init_opt_siz that does not allocate workspace, but
+ * instead employs user-supplied memory as its workspace memory.
+ *
+ * @see #sparse_cg_manual_init_sii for full details.
+ */
+sparse_err_t sparse_cg_manual_init_siz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const int * const ja, const size_t * const ia,
+	void * workspace, const size_t workspace_size
+);
+
 /**
  * Initialises a #sparse_cg_handle_t object.
  *
@@ -284,13 +418,37 @@ sparse_err_t sparse_cg_init_siz(
  * size_t-valued \a ja, and integer-valued \a ia, as also indicated by
  * the diz postfix.
  *
- * @see #sparse_cg_init_sii for full documentation.
+ * @see #sparse_cg_init_opt_sii for full documentation.
+ */
+sparse_err_t sparse_cg_init_opt_diz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const int * const ja, const size_t * const ia,
+	const bool precon, const bool numa
+);
+
+/**
+ * Variant of #sparse_cg_init_opt_diz that results in a #sparse_cg_handle_t with
+ * default hints (with preconditioning and with NUMA).
+ *
+ * @see sparse_cg_init_opt_diz for full details.
 */
 sparse_err_t sparse_cg_init_diz(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const double * const a, const int * const ja, const size_t * const ia
 );

+/**
+ * Variant of #sparse_cg_init_opt_diz that does not allocate workspace, but
+ * instead employs user-supplied memory as its workspace memory.
+ *
+ * @see #sparse_cg_manual_init_sii for full details.
+ */
+sparse_err_t sparse_cg_manual_init_diz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const int * const ja, const size_t * const ia,
+	void * workspace, const size_t workspace_size
+);
+
 /**
  * Initialises a #sparse_cg_handle_t object.
  *
@@ -298,13 +456,37 @@ sparse_err_t sparse_cg_init_diz(
  * size_t-valued \a ja and \a ia, as also indicated by the szz
  * postfix.
  *
- * @see #sparse_cg_init_sii for full documentation.
+ * @see #sparse_cg_init_opt_sii for full documentation.
+ */
+sparse_err_t sparse_cg_init_opt_szz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const size_t * const ja, const size_t * const ia,
+	const bool precon, const bool numa
+);
+
+/**
+ * Variant of #sparse_cg_init_opt_szz that results in a #sparse_cg_handle_t with
+ * default hints (with preconditioning and with NUMA).
+ *
+ * @see sparse_cg_init_opt_szz for full details.
 */
 sparse_err_t sparse_cg_init_szz(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const float * const a, const size_t * const ja, const size_t * const ia
 );

+/**
+ * Variant of #sparse_cg_init_opt_szz that does not allocate workspace, but
+ * instead employs user-supplied memory as its workspace memory.
+ *
+ * @see #sparse_cg_manual_init_sii for full details.
+ */
+sparse_err_t sparse_cg_manual_init_szz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const size_t * const ja, const size_t * const ia,
+	void * workspace, const size_t workspace_size
+);
+
 /**
  * Initialises a #sparse_cg_handle_t object.
  *
@@ -312,13 +494,89 @@ sparse_err_t sparse_cg_init_szz(
  * size_t-valued \a ja and \a ia, as also indicated by the dzz
  * postfix.
  *
- * @see #sparse_cg_init_sii for full documentation.
+ * @see #sparse_cg_init_opt_sii for full documentation.
+ */
+sparse_err_t sparse_cg_init_opt_dzz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const size_t * const ja, const size_t * const ia,
+	const bool precon, const bool numa
+);
+
+/**
+ * Variant of #sparse_cg_init_opt_dzz that results in a #sparse_cg_handle_t with
+ * default hints (with preconditioning and with NUMA).
+ *
+ * @see sparse_cg_init_opt_dzz for full details.
 */
 sparse_err_t sparse_cg_init_dzz(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const double * const a, const size_t * const ja, const size_t * const ia
 );

+/**
+ * Variant of #sparse_cg_init_opt_dzz that does not allocate workspace, but
+ * instead employs user-supplied memory as its workspace memory.
+ *
+ * @see #sparse_cg_manual_init_sii for full details.
+ */
+sparse_err_t sparse_cg_manual_init_dzz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const size_t * const ja, const size_t * const ia,
+	void * workspace, const size_t workspace_size
+);
+
+/**
+ * Returns the minimum workspace size required when manually supplying a
+ * workspace for PCG solvers to initialise with.
+ *
+ * These so-called manual initialisation functions are the following:
+ *   - #sparse_cg_manual_init_sii,
+ *   - #sparse_cg_manual_init_siz, and
+ *   - #sparse_cg_manual_init_szz.
+ * This is the variant for PCG solvers that operate on single-precision values.
+ *
+ * @param[in] n      The maximum system size the solver will be initialised
+ *                   with.
+ * @param[in] precon Whether the solver is expected to use a preconditioner.
+ *
+ * \note If preconditioning is required, the solver requires a larger work
+ *       space. Note, however, that even if not enough workspace was initially
+ *       supplied for a preconditioned solve, requesting such a solve will
+ *       still execute -- the solver will then dynamically allocate the missing
+ *       memory on the fly.
+ *
+ * A call to this function never fails.
+ *
+ * @returns The required workspace size in bytes.
+ */
+size_t sparse_cg_workspace_size_s( const size_t n, const bool precon );
+
+/**
+ * Returns the minimum workspace size required when manually supplying a
+ * workspace for PCG solvers to initialise with.
+ *
+ * These so-called manual initialisation functions are the following:
+ *   - #sparse_cg_manual_init_dii,
+ *   - #sparse_cg_manual_init_diz, and
+ *   - #sparse_cg_manual_init_dzz.
+ * This is the variant for PCG solvers that operate on double-precision values.
+ *
+ * @param[in] n      The maximum system size the solver will be initialised
+ *                   with.
+ * @param[in] precon Whether the solver is expected to use a preconditioner.
+ *
+ * \note If preconditioning is required, the solver requires a larger work
+ *       space. Note, however, that even if not enough workspace was initially
+ *       supplied for a preconditioned solve, requesting such a solve will
+ *       still execute -- the solver will then dynamically allocate the missing
+ *       memory on the fly.
+ *
+ * A call to this function never fails.
+ *
+ * @returns The required workspace size in bytes.
+ */
+size_t sparse_cg_workspace_size_d( const size_t n, const bool precon );
+
 // Note that szi and dzi are skipped on purpose. Such variants would not seem
 // sensible, though could easily be provided if they do turn out to be needed
diff --git a/src/transition/kml_iss.cpp b/src/transition/kml_iss.cpp
index 2722f4450..5c0fa2d0e 100644
--- a/src/transition/kml_iss.cpp
+++ b/src/transition/kml_iss.cpp
@@ -140,7 +140,7 @@ int KML_CG_PREFIXED( InitSI )(
 	KmlSolverTask **handle, int n,
 	const float *a, const int *ja, const int *ia
 ) {
-	// negative numbers become positive when casted to size_t
+	// negative numbers become positive when cast to size_t
 	if( n <= 0 ) { return KMLSS_BAD_N; }
 	sparse_err_t err = sparse_cg_init_sii( handle, n, a, ja, ia );
 	return sparse_err_t_2_int( err );
diff --git a/src/transition/solver.cpp b/src/transition/solver.cpp
index b52baca3d..b0a93da36 100644
--- a/src/transition/solver.cpp
+++ b/src/transition/solver.cpp
@@ -16,12 +16,15 @@
  */

-#include
 #include
+#include

 #include
 #include
+
+#include
+#include
+
 #include "solver.h"

 /**
@@ -34,6 +37,87 @@
  */

 /**
+ * Implements the various ways the CG workspace data may (or may not) be freed.
+ *
+ * This deleter at present considers that
+ *  -# the solver does not own the workspace data (and hence may not free it);
+ *  -# the solver has allocated the workspace data using standard C++ array
+ *     allocation mechanisms, for which this class provides the matching
+ *     standard C++ array deleter; and
+ *  -# the solver has allocated the workspace data using libNUMA, for which this
+ *     class provides a call to the matching libNUMA free.
+ */
+class CGWorkspaceDeleter {

+	private:

+		/** Whether we own the workspace data. */
+		const bool owning;

+		/**
+		 * Whether the workspace data was allocated NUMA-aware, and if so, what the
+		 * allocation size was.
+		 *
+		 * \note The allocation size is required for a call to the libNUMA free
+		 *       function.
+		 */
+		const size_t numa;


+	public:

+		/** Default construction is not allowed. */
+		CGWorkspaceDeleter() = delete;

+		/**
+		 * Explicit constructor that requires all information necessary to determine
+		 * how the workspace should be deleted.
+		 *
+		 * @param[in] owning_in Whether the solver owns the workspace buffer.
+		 * @param[in] numa_in   Zero if libNUMA was not used for allocating the
+		 *                      workspace buffer, and the allocation size otherwise.
+		 *
+		 * If \a owning_in is false, the input to \a numa_in is ignored.
+		 *
+		 * \note In that case, the user is responsible for freeing memory, which
+		 *       indeed could have been done in a myriad of other ways besides those
+		 *       this deleter knows about.
+		 *
+		 * \internal In case ALP is compiled without libNUMA support, the standard C++
+		 *           array mechanism will always be assumed.
+		 */
+		CGWorkspaceDeleter( const bool &owning_in, const size_t &numa_in ) noexcept :
+			owning( owning_in ), numa( numa_in )
+		{}

+		/**
+		 * Implements the correct action for freeing the workspace data.
+		 *
+		 * @param[in] ptr Pointer to the workspace memory to be deleted.
+		 */
+		void operator()( void * ptr ) {
+			assert( !( !owning && numa ) );
+			if( owning ) {
+#ifndef _GRB_NO_LIBNUMA
+				if( numa ) {
+					numa_free( ptr, numa );
+				} else {
+#endif
+					delete [] static_cast< char * >(ptr);
+#ifndef _GRB_NO_LIBNUMA
+				}
+#endif
+			}
+		}

+};

+/**
+ * The class behind the PCG solver handle.
+ *
+ * More precisely, a valid PCG solver handle is a C void pointer to an instance
+ * of this class.
+ *
  * @tparam T   The nonzero value type.
  * @tparam NZI The nonzero index type.
  * @tparam RSI The row and column index type.
@@ -106,6 +190,14 @@ class CG_Data {
 		/** Any required data for the \a preconditioner. */
 		void * preconditioner_data;

+		// destructor data
+
+		/** The given buffer during construction. */
+		void * _buffer;
+
+		/** How to handle \a _buffer on destruction. */
+		CGWorkspaceDeleter _buffer_deleter;
+

 	protected:

@@ -136,6 +228,36 @@ class CG_Data {

 	public:

+		/**
+		 * The size, in bytes, required as a work space for the PCG algorithm.
+		 *
+		 * @param[in] n              The linear system size.
+		 * @param[in] preconditioned Whether the solver is expected to be called with
+		 *                           a preconditioner.
+		 *
+		 * @returns The required size, in bytes.
+		 *
+		 * \internal Note that the space for two additional integers is required as
+		 *           per both the C and C++ specifications. However, we here also
+		 *           prevent false sharing between different workspace vectors, and so
+		 *           actually align on the cache line size boundary.
+		 *
+		 * \warning The current implementation assumes that the cache line size is a
+		 *          multiple of sizeof(int). If this assumption is ever violated,
+		 *          an error will be thrown at compile time (static_assert).
+		 */
+		static size_t workspaceSize(
+			const size_t n, const bool preconditioned
+		) noexcept {
+			static_assert( grb::config::CACHE_LINE_SIZE::value() % sizeof(int) == 0,
+				"Unhandled padding case; please submit a bug report" );
+			if( preconditioned ) {
+				return 4 * n * sizeof( T ) + 3 * grb::config::CACHE_LINE_SIZE::value();
+			} else {
+				return 3 * n * sizeof( T ) + 2 * grb::config::CACHE_LINE_SIZE::value();
+			}
+		}
+
 		/** Disable default constructor. */
 		CG_Data() = delete;

@@ -149,25 +271,93 @@ class CG_Data {
 		 *
 		 * The matrix defined by \a a, \a ja, \a ia must be symmetric positive
 		 * definite.
+		 *
+		 * @param[in] buffer The workspace required by this algorithm. This must
+		 *                   point to a valid memory region that can be used
+		 *                   exclusively by ALP. The size of the memory region (in
+		 *                   bytes) must be greater than or equal to that returned by
+		 *                   #workspaceSize( n ) with \a n equal to the value passed
+		 *                   to this constructor.
+		 *
+		 * @param[in] buffer_size The size of the given \a buffer. Used for sanity
+		 *                        checking the intended use of the buffer.
+		 *
+		 * @param[in] buffer_deleter How the given \a buffer should be deleted on
+		 *                           destruction of this instance.
+		 *
+		 * \note If the buffer is not to be owned by this instance, then a non-owning
+		 *       \a buffer_deleter should be given; see #CGWorkspaceDeleter for
+		 *       details.
+		 *
+		 * \internal The sanity check is weaker when not compiled in debug mode, and
+		 *           should hence be done explicitly \em before entering this
+		 *           constructor.
+		 *
+		 * \internal This constructor may throw exceptions.
 		 */
 		CG_Data(
 			const size_t n,
-			const T * const a, const RSI * const ja, const NZI * const ia
+			const T * const a, const RSI * const ja, const NZI * const ia,
+			void * const buffer, const size_t buffer_size, const CGWorkspaceDeleter &buffer_deleter
 		) :
 			size( n ), tolerance( 1e-5 ), max_iter( 1000 ), matrix( 0, 0 ),
 			residual( std::numeric_limits< T >::infinity() ), iters( 0 ),
-			workspace( {
-				grb::Vector< T >( n ), grb::Vector< T >( n ), grb::Vector< T >( n )
-			} ), precond_workspace( grb::Vector< T >( 0 ) ),
-			preconditioner( nullptr ), preconditioner_data( nullptr )
+			preconditioner( nullptr ), preconditioner_data( nullptr ),
+			_buffer( buffer ), _buffer_deleter( buffer_deleter )
 		{
 			assert( n > 0 );
 			assert( a != nullptr );
 			assert( ja != nullptr );
 			assert( ia != nullptr );
+			constexpr size_t L = grb::config::CACHE_LINE_SIZE::value();
+			constexpr size_t align = (L % sizeof(int) == 0)
+				? L
+				: (L + (sizeof(int) - (L % sizeof(int))));
+			if( buffer_size < workspaceSize( n, false ) ) {
+				throw std::invalid_argument( "The given buffer size is too small" );
+			}
 			Matrix A = grb::internal::wrapCRSMatrix( a, ja, ia, n, n );
 			std::swap( A, matrix );
+			char * workspace_ptr = static_cast< char * >(buffer);
+			assert( static_cast< char * >(buffer) + buffer_size >= workspace_ptr + n * sizeof( T ) );
+			{
+				T * const workspace_vector = reinterpret_cast< T * >(workspace_ptr);
+				grb::Vector< T > tmp = grb::internal::wrapRawVector( n, workspace_vector );
+				std::swap( workspace[ 0 ], tmp );
+			}
+			workspace_ptr += n * sizeof( T );
+			workspace_ptr +=
+				(align - (reinterpret_cast< uintptr_t >(workspace_ptr) % align));
+			assert( static_cast< char * >(buffer) + buffer_size >= workspace_ptr + n * sizeof( T ) );
+			{
+				T * const workspace_vector = reinterpret_cast< T * >(workspace_ptr);
+				grb::Vector< T > tmp = grb::internal::wrapRawVector( n, workspace_vector );
+				std::swap( workspace[ 1 ], tmp );
+			}
+			workspace_ptr += n * sizeof( T );
+			workspace_ptr +=
+				(align - (reinterpret_cast< uintptr_t >(workspace_ptr) % align));
+			assert( static_cast< char * >(buffer) + buffer_size >= workspace_ptr + n * sizeof( T ) );
+			{
+				T * const workspace_vector = reinterpret_cast< T * >(workspace_ptr);
+				grb::Vector< T > tmp = grb::internal::wrapRawVector( n, workspace_vector );
+				std::swap( workspace[ 2 ], tmp );
+			}
+			if( buffer_size >= workspaceSize( n, true ) ) {
+				workspace_ptr += n * sizeof( T );
+				workspace_ptr +=
+					(align - (reinterpret_cast< uintptr_t >(workspace_ptr) % align));
+				assert( static_cast< char * >(buffer) + buffer_size >= workspace_ptr + n * sizeof( T ) );
+				T * const workspace_vector = reinterpret_cast< T * >(workspace_ptr);
+				grb::Vector< T > tmp = grb::internal::wrapRawVector( n, workspace_vector );
+				std::swap( precond_workspace, tmp );
+			}
+		}
+
+		/** Destroys this CG solve handle. */
+		~CG_Data() {
+			// call the deleter on the buffer pointer (which may be a no-op)
+			_buffer_deleter( _buffer );
 		}

 		/** @returns The system size. */
@@ -231,8 +421,15 @@ class CG_Data {
 			preconditioner_data = data;
 			assert( !( !preconditioner && preconditioner_data ) );
 			if( grb::size( precond_workspace ) == 0 ) {
+				// note: using the default ALP vector allocation here is the only
+				// convenient option -- otherwise, buffer management would have to be
+				// implemented for this vector alone. If it is important that this
+				// vector be allocated without a SPA, then the user should set the
+				// precond hint to true.
 				grb::Vector< T > replace( size );
 				std::swap( replace, precond_workspace );
+				std::cerr << "Warning: allocating additional workspace to handle CG solves "
+					<< "with preconditioning. To prevent this on-demand allocation, set the "
+					<< "precond option during CG solver handle creation to true.\n";
 			}
 			assert( grb::size( precond_workspace ) == size );
 		}
@@ -277,15 +474,20 @@ class CG_Data {
 template< typename T, typename NZI, typename RSI >
 static sparse_err_t sparse_cg_init_impl(
 	sparse_cg_handle_t * const handle, const size_t n,
-	const T * const a, const RSI * const ja, const NZI * const ia
+	const T * const a, const RSI * const ja, const NZI * const ia,
+	void * const buffer, const size_t bufferSize, const CGWorkspaceDeleter &deleter
 ) {
 	if( n == 0 ) { return ILLEGAL_ARGUMENT; }
 	if( handle == nullptr || a == nullptr || ja == nullptr || ia == nullptr ) {
 		return NULL_ARGUMENT;
 	}
+	if( buffer == nullptr ) { return NULL_ARGUMENT; }
+	if( bufferSize < CG_Data< T, NZI, RSI >::workspaceSize( n, false ) ) {
+		return ILLEGAL_ARGUMENT;
+	}
 	try {
-		*handle = static_cast< void * >(
-			new CG_Data< T, NZI, RSI >( n, a, ja, ia ) );
+		*handle = static_cast< void * >( new CG_Data< T, NZI, RSI >(
+			n, a, ja, ia, buffer, bufferSize, deleter ) );
 	} catch( std::exception &e ) {
 		// the grb::Matrix constructor may only throw on out of memory errors
 		std::cerr << "Error: " << e.what() << "\n";
@@ -295,46 +497,215 @@ static sparse_err_t sparse_cg_init_impl(
 	return NO_ERROR;
 }

+size_t sparse_cg_workspace_size_s( const size_t n, const bool precon ) {
+	return CG_Data< float, size_t, size_t >::workspaceSize( n, precon );
+}
+
+size_t sparse_cg_workspace_size_d( const size_t n, const bool precon ) {
+	return CG_Data< double, size_t, size_t >::workspaceSize( n, precon );
+}
+
+template< typename T, typename NZI, typename RSI >
+static sparse_err_t sparse_cg_init_impl_no_buffer(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const T * const a, const RSI * const ja, const NZI * const ia,
+	const bool support_preconditioning, const bool numa
+) {
+	const size_t allocSize = CG_Data< T, NZI, RSI >::
+		workspaceSize( n, support_preconditioning );
+	void * buffer = nullptr;
+#ifdef _GRB_NO_LIBNUMA
+	if( numa ) {
+		std::cerr << "Warning: solver transition path was requested to perform "
+			<< "NUMA-aware allocation, but ALP was compiled without libnuma.\n";
+	}
+#else
+	if( numa ) {
+		buffer = numa_alloc_interleaved( allocSize );
+		if( buffer == NULL ) { return OUT_OF_MEMORY; }
+	} else {
+#endif
+		try {
+			buffer = static_cast< void * >(new char[ allocSize ]);
+		} catch( ... ) {
+			std::cerr << "Error allocating workspace buffer\n";
+			return OUT_OF_MEMORY;
+		}
+#ifndef _GRB_NO_LIBNUMA
+	}
+#endif
+	const sparse_err_t rc = sparse_cg_init_impl(
+		handle, n, a, ja, ia, buffer, allocSize,
+		CGWorkspaceDeleter( true, numa ? allocSize : 0 )
+	);
+	if( rc != NO_ERROR ) {
+#ifndef _GRB_NO_LIBNUMA
+		if( numa ) {
+			numa_free( buffer, allocSize );
+		} else {
+#endif
+			delete [] static_cast< char * >(buffer);
+#ifndef _GRB_NO_LIBNUMA
+		}
+#endif
+	}
+	return rc;
+}
+
+sparse_err_t sparse_cg_manual_init_sii(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const int * const ja, const int * const ia,
+	void * workspace, const size_t workspace_size
+) {
+	return sparse_cg_init_impl< float, int, int >( handle, n, a, ja, ia,
+		workspace, workspace_size, CGWorkspaceDeleter( false, 0 ) );
+}
+
+sparse_err_t sparse_cg_manual_init_dii(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const int * const ja, const int * const ia,
+	void * workspace, const size_t workspace_size
+) {
+	return sparse_cg_init_impl< double, int, int >( handle, n, a, ja, ia,
+		workspace, workspace_size, CGWorkspaceDeleter( false, 0 ) );
+}
+
+sparse_err_t sparse_cg_manual_init_siz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const int * const ja, const size_t * const ia,
+	void * workspace, const size_t workspace_size
+) {
+	return sparse_cg_init_impl< float, size_t, int >( handle, n, a, ja, ia,
+		workspace, workspace_size, CGWorkspaceDeleter( false, 0 ) );
+}
+
+sparse_err_t sparse_cg_manual_init_diz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const int * const ja, const size_t * const ia,
+	void * workspace, const size_t workspace_size
+) {
+	return sparse_cg_init_impl< double, size_t, int >( handle, n, a, ja, ia,
+		workspace, workspace_size, CGWorkspaceDeleter( false, 0 ) );
+}
+
+sparse_err_t sparse_cg_manual_init_szz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const size_t * const ja, const size_t * const ia,
+	void * workspace, const size_t workspace_size
+) {
+	return sparse_cg_init_impl< float, size_t, size_t >( handle, n, a, ja, ia,
+		workspace, workspace_size, CGWorkspaceDeleter( false, 0 ) );
+}
+
+sparse_err_t sparse_cg_manual_init_dzz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const size_t * const ja, const size_t * const ia,
+	void * workspace, const size_t workspace_size
+) {
+	return sparse_cg_init_impl< double, size_t, size_t >( handle, n, a, ja, ia,
+		workspace, workspace_size, CGWorkspaceDeleter( false, 0 ) );
+}
+
+sparse_err_t sparse_cg_init_opt_sii(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const int * const ja, const int * const ia,
+	const bool precond, const bool numa
+) {
+	return sparse_cg_init_impl_no_buffer< float, int, int >( handle, n, a, ja, ia,
+		precond, numa );
+}
+
+sparse_err_t sparse_cg_init_opt_dii(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const int * const ja, const int * const ia,
+	const bool precond, const bool numa
+) {
+	return sparse_cg_init_impl_no_buffer< double, int, int >( handle, n, a, ja, ia,
+		precond, numa );
+}
+
+sparse_err_t sparse_cg_init_opt_siz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const int * const ja, const size_t * const ia,
+	const bool precond, const bool numa
+) {
+	return sparse_cg_init_impl_no_buffer< float, size_t, int >( handle, n, a, ja,
+		ia, precond, numa );
+}
+
+sparse_err_t sparse_cg_init_opt_diz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const int * const ja, const size_t * const ia,
+	const bool precond, const bool numa
+) {
+	return sparse_cg_init_impl_no_buffer< double, size_t, int >( handle, n, a, ja,
+		ia, precond, numa );
+}
+
+sparse_err_t sparse_cg_init_opt_szz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const float * const a, const size_t * const ja, const size_t * const ia,
+	const bool precond, const bool numa
+) {
+	return sparse_cg_init_impl_no_buffer< float, size_t, size_t >( handle, n, a, ja,
+		ia, precond, numa );
+}
+
+sparse_err_t sparse_cg_init_opt_dzz(
+	sparse_cg_handle_t * const handle, const size_t n,
+	const double * const a, const size_t * const ja, const size_t * const ia,
+	const bool precond, const bool numa
+) {
+	return sparse_cg_init_impl_no_buffer< double, size_t, size_t >( handle, n, a, ja,
+		ia, precond, numa );
+}
+
 sparse_err_t sparse_cg_init_sii(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const float * const a, const int * const ja, const int * const ia
 ) {
-	return sparse_cg_init_impl< float, int, int >( handle, n, a, ja, ia );
+	return sparse_cg_init_impl_no_buffer< float, int, int >( handle, n, a, ja, ia,
+		true, true );
 }

 sparse_err_t sparse_cg_init_dii(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const double * const a, const int * const ja, const int * const ia
 ) {
-	return sparse_cg_init_impl< double, int, int >( handle, n, a, ja, ia );
+	return sparse_cg_init_impl_no_buffer< double, int, int >( handle, n, a, ja, ia,
+		true, true );
 }

 sparse_err_t sparse_cg_init_siz(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const float * const a, const int * const ja, const size_t * const ia
 ) {
-	return sparse_cg_init_impl< float, size_t, int >( handle, n, a, ja, ia );
+	return sparse_cg_init_impl_no_buffer< float, size_t, int >( handle, n, a, ja,
+		ia, true, true );
 }

 sparse_err_t sparse_cg_init_diz(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const double * const a, const int * const ja, const size_t * const ia
 ) {
-	return sparse_cg_init_impl< double, size_t, int >( handle, n, a, ja, ia );
+	return sparse_cg_init_impl_no_buffer< double, size_t, int >( handle, n, a, ja,
+		ia, true, true );
 }

 sparse_err_t sparse_cg_init_szz(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const float * const a, const size_t * const ja, const size_t * const ia
 ) {
-	return sparse_cg_init_impl< float, size_t, size_t >( handle, n, a, ja, ia );
+	return sparse_cg_init_impl_no_buffer< float, size_t, size_t >( handle, n, a, ja,
+		ia, true, true );
 }

 sparse_err_t sparse_cg_init_dzz(
 	sparse_cg_handle_t * const handle, const size_t n,
 	const double * const a, const size_t * const ja, const size_t * const ia
 ) {
-	return sparse_cg_init_impl< double, size_t, size_t >( handle, n, a, ja, ia );
+	return sparse_cg_init_impl_no_buffer< double, size_t, size_t >( handle, n, a, ja,
+		ia, true, true );
 }

 template< typename T, typename NZI, typename RSI >
@@ -525,7 +896,8 @@ static sparse_err_t sparse_cg_get_iter_count_impl(
 	const sparse_cg_handle_t handle, size_t * const iters
 ) {
 	if( handle == nullptr || iters == nullptr ) { return NULL_ARGUMENT; }
-	*iters = static_cast< CG_Data< T, NZI, RSI > * >( handle )->getIters();
+	*iters =
+		static_cast< CG_Data< T, NZI, RSI > * >( handle )->getIters();
 	return NO_ERROR;
 }

@@ -646,8 +1018,10 @@ static sparse_err_t sparse_cg_set_preconditioner_impl(
 	try {
 		static_cast< CG_Data< T, NZI, RSI > * >( handle )->
 			setPreconditioner( c_precond_p, c_precond_data_p );
-	} catch(...) {
-		// spec says ALP vector allocation can only throw due to out-of-memory
+	} catch( ... ) {
+		// by virtue of the ALP interface guarantees, the only possible exception is
+		// due to out-of-memory conditions (when allocating an on-demand vector
+		// workspace for preconditioned solves).
 		return OUT_OF_MEMORY;
 	}
 	return NO_ERROR;
@@ -712,8 +1086,7 @@ static sparse_err_t sparse_cg_set_max_iter_count_impl(
 	sparse_cg_handle_t handle, const size_t max_iters
 ) {
 	if( handle == nullptr ) { return NULL_ARGUMENT; }
-	static_cast< CG_Data< T, NZI, RSI > * >( handle )->
-		setMaxIters( max_iters );
+	static_cast< CG_Data< T, NZI, RSI > * >( handle )->setMaxIters( max_iters );
 	return NO_ERROR;
 }
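
For reference, a minimal C sketch of how the two new initialisation paths introduced by this diff compose from the caller's side. The 3-by-3 SPD CRS system, the error handling, and the call to sparse_cg_destroy_dii (assumed here to follow the naming and signature conventions of the existing destroy functions, such as sparse_cg_destroy_sii) are illustrative assumptions only and not part of this diff:

#include <stdbool.h>
#include <stdlib.h>

#include "solver.h"

int example( void ) {
	/* an illustrative 3-by-3 SPD tridiagonal system in CRS format */
	const size_t n = 3;
	const double a[] = { 2.0, -1.0, -1.0, 2.0, -1.0, -1.0, 2.0 };
	const int ja[] = { 0, 1, 0, 1, 2, 1, 2 };
	const int ia[] = { 0, 2, 5, 7 };
	sparse_cg_handle_t handle;

	/* path 1: hinted initialisation -- ALP allocates the workspace itself;
	 * here we hint that no preconditioning and no NUMA-aware allocation are
	 * needed, so only three auxiliary vectors of size n are set up */
	sparse_err_t err = sparse_cg_init_opt_dii( &handle, n, a, ja, ia,
		false, false );
	if( err != NO_ERROR ) { return 1; }
	/* ...solver calls... */
	(void) sparse_cg_destroy_dii( handle ); /* assumed destroy variant */

	/* path 2: manual initialisation -- the caller supplies the workspace;
	 * no dynamic allocation then occurs inside the handle */
	const size_t ws_size = sparse_cg_workspace_size_d( n, false );
	void * const ws = malloc( ws_size );
	if( ws == NULL ) { return 2; }
	err = sparse_cg_manual_init_dii( &handle, n, a, ja, ia, ws, ws_size );
	if( err != NO_ERROR ) { free( ws ); return 3; }
	/* ...solver calls... */
	(void) sparse_cg_destroy_dii( handle ); /* does not free ws */
	free( ws );                             /* ownership is back with the caller */
	return 0;
}

Note that because the manual path above requests the non-preconditioned workspace size, a later call that sets a preconditioner would trigger the documented on-demand allocation (and its warning); passing sparse_cg_workspace_size_d( n, true ) instead avoids this.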