Merged
178 changes: 110 additions & 68 deletions include/graphblas/reference/blas1.hpp
@@ -5144,12 +5144,21 @@ namespace grb {
return foldl< descr >( z, m, add, ring.getAdditiveMonoid(), phase );
}

// declare an internal version of eWiseMulAdd containing the full sparse & dense implementations
// declare an internal version of eWiseMulAdd containing the full
// sparse & dense implementations
namespace internal {

/**
* \internal
* This variant fuses the multiplication, addition, and folding into the
* output vector while looping over the elements where the mask evaluates to
* true. Since the reference implementation does not support Theta(n-nz)
* loops in cases where the mask is structural and inverted, it (statically)
* asserts that the mask is not inverted.
*/
template<
Descriptor descr,
bool a_scalar, bool x_scalar, bool y_scalar, bool z_assigned,
bool a_scalar, bool x_scalar, bool y_scalar,
typename OutputType, typename MaskType,
typename InputType1, typename InputType2, typename InputType3,
typename CoorsType, class Ring
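
As a plain illustration of the pattern this documentation describes, the following is a minimal, dense-data sketch (hypothetical helper; it assumes the scalar apply and foldr overloads used throughout this backend are in scope, and it omits the mask descriptor handling, sparsity bookkeeping, and OpenMP machinery of the real implementation):

// Sketch only: fused eWiseMulAdd driven by the mask. For every index where
// the mask is true, compute t = a*x and b = t+y, then fold b into z.
template<
	typename OutputType, typename InputType1, typename InputType2,
	typename InputType3, class Ring
>
void maskDriven_sketch(
	OutputType * const z, const bool * const m,
	const InputType1 * const a, const InputType2 * const x,
	const InputType3 * const y, const size_t n, const Ring &ring
) {
	for( size_t i = 0; i < n; ++i ) {
		if( !m[ i ] ) { continue; }   // the loop is driven by mask-true entries only
		typename Ring::D3 t;
		(void) apply( t, a[ i ], x[ i ], ring.getMultiplicativeOperator() );
		typename Ring::D4 b;
		(void) apply( b, t, y[ i ], ring.getAdditiveOperator() );
		typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ i ] );
		(void) foldr( b, out, ring.getAdditiveOperator() );
		z[ i ] = static_cast< OutputType >( out );
	}
}
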
@@ -5183,7 +5192,7 @@ namespace grb {
assert( x != y );
assert( m_coors != nullptr );
#ifdef NDEBUG
(void)n;
(void) n;
#endif

#ifdef _H_GRB_REFERENCE_OMP_BLAS1
@@ -5397,7 +5406,8 @@
#endif
// scalar coda and parallel main body
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
internal::Coordinates< reference >::Update localUpdate = z_coors.EMPTY_UPDATE();
internal::Coordinates< reference >::Update localUpdate =
z_coors.EMPTY_UPDATE();
const size_t maxAsyncAssigns = z_coors.maxAsyncAssigns();
size_t asyncAssigns = 0;
#endif
@@ -5413,48 +5423,48 @@
) {
const InputType1 * const a_p = a + ( a_scalar ? 0 : index );
const InputType2 * const x_p = x + ( x_scalar ? 0 : index );
(void)apply( t, *a_p, *x_p, ring.getMultiplicativeOperator() );
(void) apply( t, *a_p, *x_p, ring.getMultiplicativeOperator() );
if( y_scalar || y_coors->assigned( index ) ) {
const InputType3 * const y_p = y + ( y_scalar ? 0 : index );
typename Ring::D4 b;
(void)apply( b, t, *y_p, ring.getAdditiveOperator() );
(void) apply( b, t, *y_p, ring.getAdditiveOperator() );
if( z_coors.assigned( index ) ) {
typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
(void)foldr( b, out, ring.getAdditiveOperator() );
(void) foldr( b, out, ring.getAdditiveOperator() );
z[ index ] = static_cast< OutputType >( out );
} else {
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
(void)z_coors.asyncAssign( index, localUpdate );
(void)++asyncAssigns;
(void) z_coors.asyncAssign( index, localUpdate );
(void) ++asyncAssigns;
#else
(void)z_coors.assign( index );
(void) z_coors.assign( index );
#endif
z[ index ] = static_cast< OutputType >( b );
}
} else if( z_coors.assigned( index ) ) {
typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
(void)foldr( t, out, ring.getAdditiveOperator() );
(void) foldr( t, out, ring.getAdditiveOperator() );
z[ index ] = static_cast< OutputType >( out );
} else {
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
(void)z_coors.asyncAssign( index, localUpdate );
(void)++asyncAssigns;
(void) z_coors.asyncAssign( index, localUpdate );
(void) ++asyncAssigns;
#else
(void)z_coors.assign( index );
(void) z_coors.assign( index );
#endif
z[ index ] = static_cast< OutputType >( t );
}
} else if( y_coors->assigned( index ) ) {
if( z_coors.assigned( index ) ) {
typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
(void)foldr( y[ index ], out, ring.getAdditiveOperator() );
(void) foldr( y[ index ], out, ring.getAdditiveOperator() );
z[ index ] = static_cast< OutputType >( out );
} else {
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
(void)z_coors.asyncAssign( index, localUpdate );
(void)++asyncAssigns;
(void) z_coors.asyncAssign( index, localUpdate );
(void) ++asyncAssigns;
#else
(void)z_coors.assign( index );
(void) z_coors.assign( index );
#endif
z[ index ] = static_cast< OutputType >( t );
}
@@ -5463,7 +5473,7 @@
if( asyncAssigns == maxAsyncAssigns ) {
const bool was_empty = z_coors.joinUpdate( localUpdate );
#ifdef NDEBUG
(void)was_empty;
(void) was_empty;
#else
assert( !was_empty );
#endif
@@ -5479,12 +5489,14 @@
}

/**
* \internal
* Call this version if consuming the multiplication first and performing the
* addition separately is cheaper than fusing the computations.
* addition separately is cheaper than fusing the computations as done in the
* mask-driven variant.
*/
template<
Descriptor descr,
bool masked, bool a_scalar, bool x_scalar, bool y_scalar, bool z_assigned,
bool masked, bool a_scalar, bool x_scalar, bool y_scalar,
typename OutputType, typename MaskType,
typename InputType1, typename InputType2, typename InputType3,
typename CoorsType, class Ring
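
For context, here is a rough sketch of the two-phase strategy this comment refers to (hypothetical helper over a single domain; it ignores masking, uses plain boolean assignment flags instead of the Coordinates structures, and assumes y is dense):

// Sketch only: phase 1 writes the products a (*) x into z wherever both
// operands are assigned; phase 2 then folds y into z with the additive
// operator. The real code delegates phase 2 to a masked foldl with the
// additive monoid.
template< typename D, class Ring >
void mulDriven_sketch(
	D * const z, bool * const z_assigned,
	const D * const a, const bool * const a_assigned,
	const D * const x, const bool * const x_assigned,
	const D * const y, const size_t n, const Ring &ring
) {
	// phase 1: multiplication, driven by where a (*) x produces a value
	for( size_t i = 0; i < n; ++i ) {
		if( a_assigned[ i ] && x_assigned[ i ] ) {
			(void) apply( z[ i ], a[ i ], x[ i ], ring.getMultiplicativeOperator() );
			z_assigned[ i ] = true;
		}
	}
	// phase 2: fold the addend y into z (assign where z had no value yet)
	for( size_t i = 0; i < n; ++i ) {
		if( z_assigned[ i ] ) {
			(void) foldr( y[ i ], z[ i ], ring.getAdditiveOperator() );
		} else {
			z[ i ] = y[ i ];
			z_assigned[ i ] = true;
		}
	}
}
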
@@ -5538,7 +5550,7 @@
typename Ring::D3 t;
const InputType1 * const a_p = a_scalar ? a : a + index;
const InputType2 * const x_p = x_scalar ? x : x + index;
(void)apply( t, *a_p, *x_p, ring.getMultiplicativeOperator() );
(void) apply( t, *a_p, *x_p, ring.getMultiplicativeOperator() );
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
if( z_coors.asyncAssign( index, localUpdate ) ) {
#else
@@ -5552,9 +5564,9 @@
static_cast< typename Ring::D4 >( t )
);
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
(void)++asyncAssigns;
(void) ++asyncAssigns;
if( asyncAssigns == maxAsyncAssigns ) {
(void)z_coors.joinUpdate( localUpdate );
(void) z_coors.joinUpdate( localUpdate );
asyncAssigns = 0;
}
#endif
@@ -5571,7 +5583,8 @@
if( y_scalar ) {
return foldl< descr >( z_vector, *m_vector, *y, ring.getAdditiveMonoid() );
} else {
return foldl< descr >( z_vector, *m_vector, *y_vector, ring.getAdditiveMonoid() );
return foldl< descr >( z_vector, *m_vector, *y_vector,
ring.getAdditiveMonoid() );
}
} else {
if( y_scalar ) {
@@ -5582,10 +5595,22 @@
}
}

/** In this variant, all vector inputs (and output) is dense. */
/**
* \internal
* In this variant, all input vectors, except potentially the
* input/output vector \a z, are dense.
*
* If \a z was not dense, it is assumed to be empty.
*
* @tparam assign_z True if \a z was empty, false otherwise.
*
* This implements eWiseMulAdd using a direct Theta(n) loop. This variant
* is cheaper than any of the sparse variants when the output is dense.
* \endinternal
*/
template<
Descriptor descr,
bool a_scalar, bool x_scalar, bool y_scalar, bool z_assigned,
bool a_scalar, bool x_scalar, bool y_scalar, bool assign_z,
typename OutputType, typename InputType1,
typename InputType2, typename InputType3,
typename CoorsType, class Ring
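
Before the blocked implementation below, a scalar sketch of the same Theta(n) loop may help clarify what the assign_z switch controls (illustrative only; the real code processes Ring::blocksize elements at a time to enable vectorisation, as shown next):

// Sketch only: dense eWiseMulAdd over all n indices. When assign_z is true
// the output was empty, so results are assigned directly; otherwise they are
// folded into the pre-existing values of z.
template<
	bool assign_z, typename OutputType, typename InputType1,
	typename InputType2, typename InputType3, class Ring
>
void dense_sketch(
	OutputType * const z, const InputType1 * const a,
	const InputType2 * const x, const InputType3 * const y,
	const size_t n, const Ring &ring
) {
	for( size_t i = 0; i < n; ++i ) {
		typename Ring::D3 t;
		(void) apply( t, a[ i ], x[ i ], ring.getMultiplicativeOperator() );
		typename Ring::D4 b;
		(void) apply( b, t, y[ i ], ring.getAdditiveOperator() );
		if( assign_z ) {
			z[ i ] = static_cast< OutputType >( b );            // overwrite
		} else {
			typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ i ] );
			(void) foldr( b, out, ring.getAdditiveOperator() ); // accumulate
			z[ i ] = static_cast< OutputType >( out );
		}
	}
}
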
@@ -5651,10 +5676,12 @@
// do vectorised out-of-place operations. Allows for aligned overlap.
// Non-aligned overlap is not possible due to GraphBLAS semantics.
size_t i = start;
// note: read the tail code (under this while loop) comments first for greater understanding
// note: read the tail code (under this while loop) comments first for
// greater understanding
while( i + Ring::blocksize <= end ) {
#ifdef _DEBUG
std::cout << "\tdense_eWiseMulAdd: handling block of size " << Ring::blocksize << " starting at index " << i << "\n";
std::cout << "\tdense_eWiseMulAdd: handling block of size " <<
Ring::blocksize << " starting at index " << i << "\n";
#endif
// read-in
if( !a_scalar ) {
@@ -5672,7 +5699,7 @@
yy[ b ] = static_cast< typename Ring::D4 >( y[ i + b ] );
}
}
if( !z_assigned ) {
if( !assign_z ) {
for( size_t b = 0; b < Ring::blocksize; ++b ) {
zz[ b ] = static_cast< typename Ring::D4 >( z[ i + b ] );
}
@@ -5683,14 +5710,14 @@
apply( tt[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() );
apply( bb[ b ], tt[ b ], yy[ b ], ring.getAdditiveOperator() );
}
if( !z_assigned ) {
if( !assign_z ) {
for( size_t b = 0; b < Ring::blocksize; ++b ) {
foldr( bb[ b ], zz[ b ], ring.getAdditiveOperator() );
}
}

// write-out
if( z_assigned ) {
if( assign_z ) {
for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) {
z[ i ] = static_cast< OutputType >( bb[ b ] );
}
@@ -5730,23 +5757,23 @@
foldr( ts, ys, ring.getAdditiveOperator() );

// write out
if( z_assigned ) {
if( assign_z ) {
*z = static_cast< OutputType >( ys );
} else {
foldr( ys, *z, ring.getAdditiveOperator() );
}

// move pointers
if( !a_scalar ) {
(void)a++;
(void) a++;
}
if( !x_scalar ) {
(void)x++;
(void) x++;
}
if( !y_scalar ) {
(void)y++;
(void) y++;
}
(void)z++;
(void) z++;
}
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
} // end OpenMP parallel section
@@ -5759,7 +5786,22 @@
/**
* \internal
* Depending on sparsity patterns, there are many variants of eWiseMulAdd.
* This function identifies and calls the most opportune variants.
* This function identifies and calls the most opportune variant. Also used
* to implement eWiseMul, though in that case the implementation could still
* be optimised by e.g. a bool that indicates whether y is zero (internal
* issue #488).
*
* @tparam a_scalar Whether \a a is a scalar or a vector.
*
* (and so on for \a x and \a y).
*
* @tparam assign_y Whether to simply assign to \a y or whether to
* (potentially) fold into \a y (in case there are
* pre-existing elements).
*
* The other arguments pertain to the output, the mask, and the input vectors
* as well as their sizes, and finally the semiring under which to perform
* the requested computation.
* \endinternal
*/
template<
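
A condensed illustration of the selection logic that follows (the helper and its exact thresholds are hypothetical; only the factor 5 for the masked loop and the dense-output shortcut are taken from the code below):

#include <algorithm>  // std::min
#include <cstddef>    // size_t

// Sketch only: the dispatcher estimates how many iterations each variant
// would perform and picks the cheapest one.
enum class Variant { dense, maskDriven, mulDriven };

inline Variant pick_variant(
	const bool dense_case,   // all inputs dense, output empty or fully dense
	const bool masked, const size_t mask_nnz,
	const size_t a_nnz, const size_t x_nnz
) {
	if( dense_case ) {
		return Variant::dense;           // plain Theta(n) loop, no bookkeeping
	}
	// the multiplication can be driven by the sparser of a and x
	const size_t mul_loop_size = std::min( a_nnz, x_nnz );
	if( masked && 5 * mask_nnz < mul_loop_size ) {
		return Variant::maskDriven;      // cheaper to loop over mask nonzeroes
	}
	return Variant::mulDriven;           // otherwise drive by the products a*x
}
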
@@ -5803,6 +5845,12 @@
m_coors->nonzeroes() == n
);
const size_t z_nns = nnz( z_vector );

// the below Boolean shall be false only if the inputs a, x, and y generate
// a dense output vector. It furthermore shall be set to false only if the
// output vector was either empty or fully dense. This is done to determine
// the exact cases in which the dense variant of the eWiseMulAdd
// implementation can be used.
const bool sparse = ( a_scalar ? false : ( a_coors->nonzeroes() < n ) ) ||
( x_scalar ? false : ( x_coors->nonzeroes() < n ) ) ||
( y_scalar ? false : ( y_coors->nonzeroes() < n ) ) ||
@@ -5815,13 +5863,13 @@
return ILLEGAL;
}

// pre-assign coors if output is dense but previously empty
const bool z_assigned = z_nns == 0 &&
( y_scalar || y_coors->nonzeroes() == n ) &&
( !masked || mask_is_dense );
if( z_assigned ) {
// pre-assign coors if output is dense but was previously totally empty
const bool assign_z = z_nns == 0 && !sparse;
if( assign_z ) {
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: detected output will be dense, pre-assigning all output coordinates\n";
std::cout << "\teWiseMulAdd_dispatch: detected output will be dense while "
<< "the output vector presently is completely empty. We therefore "
<< "pre-assign all output coordinates\n";
#endif
internal::getCoordinates( z_vector ).assignAll();
}
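
Stated in isolation, the two flags computed above reduce to roughly the following (illustrative restatement only; the actual expression for sparse continues beyond the excerpt shown here and also accounts for the mask):

#include <cstddef>  // size_t

// Sketch only (hypothetical helper): when may the dense code path be taken,
// and when is it safe to pre-assign all output coordinates at once?
inline void dense_path_flags(
	const bool a_dense, const bool x_dense, const bool y_dense,
	const size_t z_nnz, const size_t n,
	bool &sparse, bool &assign_z
) {
	// the dense variant requires dense inputs and an output that is either
	// empty or already fully dense (the real check also involves the mask)
	sparse = !( a_dense && x_dense && y_dense &&
		( z_nnz == 0 || z_nnz == n ) );
	// pre-assigning all of z is only safe when the output is about to become
	// dense and nothing pre-existing could be lost, i.e., when z is empty
	assign_z = ( z_nnz == 0 ) && !sparse;
}
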
@@ -5843,10 +5891,12 @@
(x_scalar ? n : x_coors->nonzeroes())
) // min is worst-case, best case is 0, realistic is some a priori unknown
// problem-dependent overlap
std::cout << "\t\teWiseMulAdd_dispatch: add_loop_size = " << add_loop_size << "\n";
std::cout << "\t\teWiseMulAdd_dispatch: add_loop_size = " << add_loop_size
<< "\n";
;*/
#ifdef _DEBUG
std::cout << "\t\teWiseMulAdd_dispatch: mul_loop_size = " << mul_loop_size << "\n";
std::cout << "\t\teWiseMulAdd_dispatch: mul_loop_size = " << mul_loop_size
<< "\n";
#endif
if( masked ) {
const size_t mask_loop_size = 5 * m_coors->nonzeroes();
@@ -5860,38 +5910,28 @@
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: will be driven by output mask\n";
#endif
if( z_assigned ) {
return sparse_eWiseMulAdd_maskDriven<
descr, a_scalar, x_scalar, y_scalar, true
>( z_vector, m, m_coors, a, a_coors, x, x_coors, y, y_coors, n, ring );
} else {
return sparse_eWiseMulAdd_maskDriven<
descr, a_scalar, x_scalar, y_scalar, false
>( z_vector, m, m_coors, a, a_coors, x, x_coors, y, y_coors, n, ring );
}
return sparse_eWiseMulAdd_maskDriven<
descr, a_scalar, x_scalar, y_scalar
>( z_vector, m, m_coors, a, a_coors, x, x_coors, y, y_coors, n, ring );
}
}
// internal issue #42 (closed), see also above:
// if( mul_loop_size < add_loop_size ) {
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: will be driven by the multiplication a*x\n";
#endif
if( z_assigned ) {
return twoPhase_sparse_eWiseMulAdd_mulDriven<
descr, masked, a_scalar, x_scalar, y_scalar, true
>( z_vector, m_vector, a, a_coors, x, x_coors, y_vector, y, n, ring );
} else {
return twoPhase_sparse_eWiseMulAdd_mulDriven<
descr, masked, a_scalar, x_scalar, y_scalar, false
>( z_vector, m_vector, a, a_coors, x, x_coors, y_vector, y, n, ring );
}
return twoPhase_sparse_eWiseMulAdd_mulDriven<
descr, masked, a_scalar, x_scalar, y_scalar
>( z_vector, m_vector, a, a_coors, x, x_coors, y_vector, y, n, ring );
/* internal issue #42 (closed), see also above:
} else {
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: will be driven by the addition with y\n";
#endif
if( z_assigned ) {
return twoPhase_sparse_eWiseMulAdd_addPhase1< descr, masked, a_scalar, x_scalar, y_scalar, true >(
if( assign_z ) {
return twoPhase_sparse_eWiseMulAdd_addPhase1<
descr, masked, a_scalar, x_scalar, y_scalar, true
>(
z_vector,
m, m_coors,
a, a_coors,
@@ -5900,7 +5940,9 @@
n, ring
);
} else {
return twoPhase_sparse_eWiseMulAdd_addPhase1< descr, masked, a_scalar, x_scalar, y_scalar, false >(
return twoPhase_sparse_eWiseMulAdd_addPhase1<
descr, masked, a_scalar, x_scalar, y_scalar, false
>(
z_vector,
m, m_coors,
a, a_coors,
@@ -5921,7 +5963,7 @@
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: will perform a dense eWiseMulAdd\n";
#endif
if( z_assigned ) {
if( assign_z ) {
return dense_eWiseMulAdd<
descr, a_scalar, x_scalar, y_scalar, true
>( z_vector, a, x, y, n, ring );