Merged
178 changes: 110 additions & 68 deletions include/graphblas/reference/blas1.hpp
@@ -5144,12 +5144,21 @@ namespace grb {
return foldl< descr >( z, m, add, ring.getAdditiveMonoid(), phase );
}

// declare an internal version of eWiseMulAdd containing the full sparse & dense implementations
// declare an internal version of eWiseMulAdd containing the full
// sparse & dense implementations
namespace internal {

/**
* \internal
* This variant fuses the multiplication, addition, and folding into the
* output vector while looping over the elements where the mask evaluates to
* true. Since the reference implementation does not support Theta(n-nz)
* loops in cases where the mask is structural and inverted, it (statically)
* asserts that the mask is not inverted.
*/
template<
Descriptor descr,
bool a_scalar, bool x_scalar, bool y_scalar, bool z_assigned,
bool a_scalar, bool x_scalar, bool y_scalar,
typename OutputType, typename MaskType,
typename InputType1, typename InputType2, typename InputType3,
typename CoorsType, class Ring
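
As a plain illustration of the pattern this documentation describes, the following is a minimal, dense-data sketch (hypothetical helper; it assumes the scalar apply and foldr overloads used throughout this backend are in scope, and it omits the mask descriptor handling, sparsity bookkeeping, and OpenMP machinery of the real implementation):

// Sketch only: fused eWiseMulAdd driven by the mask. For every index where
// the mask is true, compute t = a*x and b = t+y, then fold b into z.
template<
	typename OutputType, typename InputType1, typename InputType2,
	typename InputType3, class Ring
>
void maskDriven_sketch(
	OutputType * const z, const bool * const m,
	const InputType1 * const a, const InputType2 * const x,
	const InputType3 * const y, const size_t n, const Ring &ring
) {
	for( size_t i = 0; i < n; ++i ) {
		if( !m[ i ] ) { continue; }   // the loop is driven by mask-true entries only
		typename Ring::D3 t;
		(void) apply( t, a[ i ], x[ i ], ring.getMultiplicativeOperator() );
		typename Ring::D4 b;
		(void) apply( b, t, y[ i ], ring.getAdditiveOperator() );
		typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ i ] );
		(void) foldr( b, out, ring.getAdditiveOperator() );
		z[ i ] = static_cast< OutputType >( out );
	}
}
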
@@ -5183,7 +5192,7 @@ namespace grb {
assert( x != y );
assert( m_coors != nullptr );
#ifdef NDEBUG
(void)n;
(void) n;
#endif

#ifdef _H_GRB_REFERENCE_OMP_BLAS1
@@ -5397,7 +5406,8 @@
#endif
// scalar coda and parallel main body
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
internal::Coordinates< reference >::Update localUpdate = z_coors.EMPTY_UPDATE();
internal::Coordinates< reference >::Update localUpdate =
z_coors.EMPTY_UPDATE();
const size_t maxAsyncAssigns = z_coors.maxAsyncAssigns();
size_t asyncAssigns = 0;
#endif
@@ -5413,48 +5423,48 @@
) {
const InputType1 * const a_p = a + ( a_scalar ? 0 : index );
const InputType2 * const x_p = x + ( x_scalar ? 0 : index );
(void)apply( t, *a_p, *x_p, ring.getMultiplicativeOperator() );
(void) apply( t, *a_p, *x_p, ring.getMultiplicativeOperator() );
if( y_scalar || y_coors->assigned( index ) ) {
const InputType3 * const y_p = y + ( y_scalar ? 0 : index );
typename Ring::D4 b;
(void)apply( b, t, *y_p, ring.getAdditiveOperator() );
(void) apply( b, t, *y_p, ring.getAdditiveOperator() );
if( z_coors.assigned( index ) ) {
typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
(void)foldr( b, out, ring.getAdditiveOperator() );
(void) foldr( b, out, ring.getAdditiveOperator() );
z[ index ] = static_cast< OutputType >( out );
} else {
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
(void)z_coors.asyncAssign( index, localUpdate );
(void)++asyncAssigns;
(void) z_coors.asyncAssign( index, localUpdate );
(void) ++asyncAssigns;
#else
(void)z_coors.assign( index );
(void) z_coors.assign( index );
#endif
z[ index ] = static_cast< OutputType >( b );
}
} else if( z_coors.assigned( index ) ) {
typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
(void)foldr( t, out, ring.getAdditiveOperator() );
(void) foldr( t, out, ring.getAdditiveOperator() );
z[ index ] = static_cast< OutputType >( out );
} else {
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
(void)z_coors.asyncAssign( index, localUpdate );
(void)++asyncAssigns;
(void) z_coors.asyncAssign( index, localUpdate );
(void) ++asyncAssigns;
#else
(void)z_coors.assign( index );
(void) z_coors.assign( index );
#endif
z[ index ] = static_cast< OutputType >( t );
}
} else if( y_coors->assigned( index ) ) {
if( z_coors.assigned( index ) ) {
typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
(void)foldr( y[ index ], out, ring.getAdditiveOperator() );
(void) foldr( y[ index ], out, ring.getAdditiveOperator() );
z[ index ] = static_cast< OutputType >( out );
} else {
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
(void)z_coors.asyncAssign( index, localUpdate );
(void)++asyncAssigns;
(void) z_coors.asyncAssign( index, localUpdate );
(void) ++asyncAssigns;
#else
(void)z_coors.assign( index );
(void) z_coors.assign( index );
#endif
z[ index ] = static_cast< OutputType >( t );
}
@@ -5463,7 +5473,7 @@
if( asyncAssigns == maxAsyncAssigns ) {
const bool was_empty = z_coors.joinUpdate( localUpdate );
#ifdef NDEBUG
(void)was_empty;
(void) was_empty;
#else
assert( !was_empty );
#endif
@@ -5479,12 +5489,14 @@
}

/**
* \internal
* Call this version if consuming the multiplication first and performing the
* addition separately is cheaper than fusing the computations.
* addition separately is cheaper than fusing the computations as done in the
* mask-driven variant.
*/
template<
Descriptor descr,
bool masked, bool a_scalar, bool x_scalar, bool y_scalar, bool z_assigned,
bool masked, bool a_scalar, bool x_scalar, bool y_scalar,
typename OutputType, typename MaskType,
typename InputType1, typename InputType2, typename InputType3,
typename CoorsType, class Ring
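
For context, here is a rough sketch of the two-phase strategy this comment refers to (hypothetical helper over a single domain; it ignores masking, uses plain boolean assignment flags instead of the Coordinates structures, and assumes y is dense):

// Sketch only: phase 1 writes the products a (*) x into z wherever both
// operands are assigned; phase 2 then folds y into z with the additive
// operator. The real code delegates phase 2 to a masked foldl with the
// additive monoid.
template< typename D, class Ring >
void mulDriven_sketch(
	D * const z, bool * const z_assigned,
	const D * const a, const bool * const a_assigned,
	const D * const x, const bool * const x_assigned,
	const D * const y, const size_t n, const Ring &ring
) {
	// phase 1: multiplication, driven by where a (*) x produces a value
	for( size_t i = 0; i < n; ++i ) {
		if( a_assigned[ i ] && x_assigned[ i ] ) {
			(void) apply( z[ i ], a[ i ], x[ i ], ring.getMultiplicativeOperator() );
			z_assigned[ i ] = true;
		}
	}
	// phase 2: fold the addend y into z (assign where z had no value yet)
	for( size_t i = 0; i < n; ++i ) {
		if( z_assigned[ i ] ) {
			(void) foldr( y[ i ], z[ i ], ring.getAdditiveOperator() );
		} else {
			z[ i ] = y[ i ];
			z_assigned[ i ] = true;
		}
	}
}
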
@@ -5538,7 +5550,7 @@
typename Ring::D3 t;
const InputType1 * const a_p = a_scalar ? a : a + index;
const InputType2 * const x_p = x_scalar ? x : x + index;
(void)apply( t, *a_p, *x_p, ring.getMultiplicativeOperator() );
(void) apply( t, *a_p, *x_p, ring.getMultiplicativeOperator() );
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
if( z_coors.asyncAssign( index, localUpdate ) ) {
#else
@@ -5552,9 +5564,9 @@
static_cast< typename Ring::D4 >( t )
);
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
(void)++asyncAssigns;
(void) ++asyncAssigns;
if( asyncAssigns == maxAsyncAssigns ) {
(void)z_coors.joinUpdate( localUpdate );
(void) z_coors.joinUpdate( localUpdate );
asyncAssigns = 0;
}
#endif
@@ -5571,7 +5583,8 @@
if( y_scalar ) {
return foldl< descr >( z_vector, *m_vector, *y, ring.getAdditiveMonoid() );
} else {
return foldl< descr >( z_vector, *m_vector, *y_vector, ring.getAdditiveMonoid() );
return foldl< descr >( z_vector, *m_vector, *y_vector,
ring.getAdditiveMonoid() );
}
} else {
if( y_scalar ) {
@@ -5582,10 +5595,22 @@
}
}

/** In this variant, all vector inputs (and output) is dense. */
/**
* \internal
* In this variant, all input vectors, except potentially the
* input/output vector \a z, are dense.
*
* If \a z was not dense, it is assumed to be empty.
*
* @tparam assign_z True if \a z was empty, false otherwise.
*
* This implements eWiseMulAdd using a direct Theta(n) loop. This variant
* is cheaper than any of the sparse variants when the output is dense.
* \endinternal
*/
template<
Descriptor descr,
bool a_scalar, bool x_scalar, bool y_scalar, bool z_assigned,
bool a_scalar, bool x_scalar, bool y_scalar, bool assign_z,
typename OutputType, typename InputType1,
typename InputType2, typename InputType3,
typename CoorsType, class Ring
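
Before the blocked implementation below, a scalar sketch of the same Theta(n) loop may help clarify what the assign_z switch controls (illustrative only; the real code processes Ring::blocksize elements at a time to enable vectorisation, as shown next):

// Sketch only: dense eWiseMulAdd over all n indices. When assign_z is true
// the output was empty, so results are assigned directly; otherwise they are
// folded into the pre-existing values of z.
template<
	bool assign_z, typename OutputType, typename InputType1,
	typename InputType2, typename InputType3, class Ring
>
void dense_sketch(
	OutputType * const z, const InputType1 * const a,
	const InputType2 * const x, const InputType3 * const y,
	const size_t n, const Ring &ring
) {
	for( size_t i = 0; i < n; ++i ) {
		typename Ring::D3 t;
		(void) apply( t, a[ i ], x[ i ], ring.getMultiplicativeOperator() );
		typename Ring::D4 b;
		(void) apply( b, t, y[ i ], ring.getAdditiveOperator() );
		if( assign_z ) {
			z[ i ] = static_cast< OutputType >( b );            // overwrite
		} else {
			typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ i ] );
			(void) foldr( b, out, ring.getAdditiveOperator() ); // accumulate
			z[ i ] = static_cast< OutputType >( out );
		}
	}
}
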
@@ -5651,10 +5676,12 @@
// do vectorised out-of-place operations. Allows for aligned overlap.
// Non-aligned overlap is not possible due to GraphBLAS semantics.
size_t i = start;
// note: read the tail code (under this while loop) comments first for greater understanding
// note: read the tail code (under this while loop) comments first for
// greater understanding
while( i + Ring::blocksize <= end ) {
#ifdef _DEBUG
std::cout << "\tdense_eWiseMulAdd: handling block of size " << Ring::blocksize << " starting at index " << i << "\n";
std::cout << "\tdense_eWiseMulAdd: handling block of size " <<
Ring::blocksize << " starting at index " << i << "\n";
#endif
// read-in
if( !a_scalar ) {
@@ -5672,7 +5699,7 @@
yy[ b ] = static_cast< typename Ring::D4 >( y[ i + b ] );
}
}
if( !z_assigned ) {
if( !assign_z ) {
for( size_t b = 0; b < Ring::blocksize; ++b ) {
zz[ b ] = static_cast< typename Ring::D4 >( z[ i + b ] );
}
@@ -5683,14 +5710,14 @@
apply( tt[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() );
apply( bb[ b ], tt[ b ], yy[ b ], ring.getAdditiveOperator() );
}
if( !z_assigned ) {
if( !assign_z ) {
for( size_t b = 0; b < Ring::blocksize; ++b ) {
foldr( bb[ b ], zz[ b ], ring.getAdditiveOperator() );
}
}

// write-out
if( z_assigned ) {
if( assign_z ) {
for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) {
z[ i ] = static_cast< OutputType >( bb[ b ] );
}
@@ -5730,23 +5757,23 @@
foldr( ts, ys, ring.getAdditiveOperator() );

// write out
if( z_assigned ) {
if( assign_z ) {
*z = static_cast< OutputType >( ys );
} else {
foldr( ys, *z, ring.getAdditiveOperator() );
}

// move pointers
if( !a_scalar ) {
(void)a++;
(void) a++;
}
if( !x_scalar ) {
(void)x++;
(void) x++;
}
if( !y_scalar ) {
(void)y++;
(void) y++;
}
(void)z++;
(void) z++;
}
#ifdef _H_GRB_REFERENCE_OMP_BLAS1
} // end OpenMP parallel section
@@ -5759,7 +5786,22 @@
/**
* \internal
* Depending on sparsity patterns, there are many variants of eWiseMulAdd.
* This function identifies and calls the most opportune variants.
* This function identifies and calls the most opportune variant. Also used
* to implement eWiseMul, though in that case the implementation could still
* be optimised by e.g. a bool that indicates whether y is zero (internal
* issue #488).
*
* @tparam a_scalar Whether \a a is a scalar or a vector.
*
* (and so on for \a x and \a y).
*
* @tparam assign_y Whether to simply assign to \a y or whether to
* (potentially) fold into \a y (in case there are
* pre-existing elements).
*
* The other arguments pertain to the output, the mask, and the input vectors
* as well as their sizes, and finally the semiring under which to perform
* the requested computation.
* \endinternal
*/
template<
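
A condensed illustration of the selection logic that follows (the helper and its exact thresholds are hypothetical; only the factor 5 for the masked loop and the dense-output shortcut are taken from the code below):

#include <algorithm>  // std::min
#include <cstddef>    // size_t

// Sketch only: the dispatcher estimates how many iterations each variant
// would perform and picks the cheapest one.
enum class Variant { dense, maskDriven, mulDriven };

inline Variant pick_variant(
	const bool dense_case,   // all inputs dense, output empty or fully dense
	const bool masked, const size_t mask_nnz,
	const size_t a_nnz, const size_t x_nnz
) {
	if( dense_case ) {
		return Variant::dense;           // plain Theta(n) loop, no bookkeeping
	}
	// the multiplication can be driven by the sparser of a and x
	const size_t mul_loop_size = std::min( a_nnz, x_nnz );
	if( masked && 5 * mask_nnz < mul_loop_size ) {
		return Variant::maskDriven;      // cheaper to loop over mask nonzeroes
	}
	return Variant::mulDriven;           // otherwise drive by the products a*x
}
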
@@ -5803,6 +5845,12 @@
m_coors->nonzeroes() == n
);
const size_t z_nns = nnz( z_vector );

// the below Boolean shall be false only if the inputs a, x, and y generate
// a dense output vector. It furthermore shall be set to false only if the
// output vector was either empty or fully dense. This is done to determine
// the exact cases in which the dense variant of the eWiseMulAdd
// implementation can be used.
const bool sparse = ( a_scalar ? false : ( a_coors->nonzeroes() < n ) ) ||
( x_scalar ? false : ( x_coors->nonzeroes() < n ) ) ||
( y_scalar ? false : ( y_coors->nonzeroes() < n ) ) ||
@@ -5815,13 +5863,13 @@
return ILLEGAL;
}

// pre-assign coors if output is dense but previously empty
const bool z_assigned = z_nns == 0 &&
( y_scalar || y_coors->nonzeroes() == n ) &&
( !masked || mask_is_dense );
if( z_assigned ) {
// pre-assign coors if output is dense but was previously totally empty
const bool assign_z = z_nns == 0 && !sparse;
if( assign_z ) {
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: detected output will be dense, pre-assigning all output coordinates\n";
std::cout << "\teWiseMulAdd_dispatch: detected output will be dense while "
<< "the output vector presently is completely empty. We therefore "
<< "pre-assign all output coordinates\n";
#endif
internal::getCoordinates( z_vector ).assignAll();
}
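
Stated in isolation, the two flags computed above reduce to roughly the following (illustrative restatement only; the actual expression for sparse continues beyond the excerpt shown here and also accounts for the mask):

#include <cstddef>  // size_t

// Sketch only (hypothetical helper): when may the dense code path be taken,
// and when is it safe to pre-assign all output coordinates at once?
inline void dense_path_flags(
	const bool a_dense, const bool x_dense, const bool y_dense,
	const size_t z_nnz, const size_t n,
	bool &sparse, bool &assign_z
) {
	// the dense variant requires dense inputs and an output that is either
	// empty or already fully dense (the real check also involves the mask)
	sparse = !( a_dense && x_dense && y_dense &&
		( z_nnz == 0 || z_nnz == n ) );
	// pre-assigning all of z is only safe when the output is about to become
	// dense and nothing pre-existing could be lost, i.e., when z is empty
	assign_z = ( z_nnz == 0 ) && !sparse;
}
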
@@ -5843,10 +5891,12 @@
(x_scalar ? n : x_coors->nonzeroes())
) // min is worst-case, best case is 0, realistic is some a priori unknown
// problem-dependent overlap
std::cout << "\t\teWiseMulAdd_dispatch: add_loop_size = " << add_loop_size << "\n";
std::cout << "\t\teWiseMulAdd_dispatch: add_loop_size = " << add_loop_size
<< "\n";
;*/
#ifdef _DEBUG
std::cout << "\t\teWiseMulAdd_dispatch: mul_loop_size = " << mul_loop_size << "\n";
std::cout << "\t\teWiseMulAdd_dispatch: mul_loop_size = " << mul_loop_size
<< "\n";
#endif
if( masked ) {
const size_t mask_loop_size = 5 * m_coors->nonzeroes();
@@ -5860,38 +5910,28 @@
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: will be driven by output mask\n";
#endif
if( z_assigned ) {
return sparse_eWiseMulAdd_maskDriven<
descr, a_scalar, x_scalar, y_scalar, true
>( z_vector, m, m_coors, a, a_coors, x, x_coors, y, y_coors, n, ring );
} else {
return sparse_eWiseMulAdd_maskDriven<
descr, a_scalar, x_scalar, y_scalar, false
>( z_vector, m, m_coors, a, a_coors, x, x_coors, y, y_coors, n, ring );
}
return sparse_eWiseMulAdd_maskDriven<
descr, a_scalar, x_scalar, y_scalar
>( z_vector, m, m_coors, a, a_coors, x, x_coors, y, y_coors, n, ring );
}
}
// internal issue #42 (closed), see also above:
// if( mul_loop_size < add_loop_size ) {
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: will be driven by the multiplication a*x\n";
#endif
if( z_assigned ) {
return twoPhase_sparse_eWiseMulAdd_mulDriven<
descr, masked, a_scalar, x_scalar, y_scalar, true
>( z_vector, m_vector, a, a_coors, x, x_coors, y_vector, y, n, ring );
} else {
return twoPhase_sparse_eWiseMulAdd_mulDriven<
descr, masked, a_scalar, x_scalar, y_scalar, false
>( z_vector, m_vector, a, a_coors, x, x_coors, y_vector, y, n, ring );
}
return twoPhase_sparse_eWiseMulAdd_mulDriven<
descr, masked, a_scalar, x_scalar, y_scalar
>( z_vector, m_vector, a, a_coors, x, x_coors, y_vector, y, n, ring );
/* internal issue #42 (closed), see also above:
} else {
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: will be driven by the addition with y\n";
#endif
if( z_assigned ) {
return twoPhase_sparse_eWiseMulAdd_addPhase1< descr, masked, a_scalar, x_scalar, y_scalar, true >(
if( assign_z ) {
return twoPhase_sparse_eWiseMulAdd_addPhase1<
descr, masked, a_scalar, x_scalar, y_scalar, true
>(
z_vector,
m, m_coors,
a, a_coors,
@@ -5900,7 +5940,9 @@
n, ring
);
} else {
return twoPhase_sparse_eWiseMulAdd_addPhase1< descr, masked, a_scalar, x_scalar, y_scalar, false >(
return twoPhase_sparse_eWiseMulAdd_addPhase1<
descr, masked, a_scalar, x_scalar, y_scalar, false
>(
z_vector,
m, m_coors,
a, a_coors,
@@ -5921,7 +5963,7 @@
#ifdef _DEBUG
std::cout << "\teWiseMulAdd_dispatch: will perform a dense eWiseMulAdd\n";
#endif
if( z_assigned ) {
if( assign_z ) {
return dense_eWiseMulAdd<
descr, a_scalar, x_scalar, y_scalar, true
>( z_vector, a, x, y, n, ring );