- Check, whether the compiler supports __builtin_clz (count leading

zeroes); if so, use it for bit-operations like opal_cube_dim and opal_hibit. Implement two versions of power-of-two. In case of opal_next_poweroftwo, this reduces the average execution time from 83 cycles to 4 cycles (Intel Nehalem, icc, -O2, inlining, measured rdtsc, with loop over 2^27 values). Numbers for other functions are similar (but of course heavily depend on the usage, e.g. opal_hibit() with a start of 4 does not save much). The bsr instruction on AMD Opteron is also not as fast. - Replace various places where the next power-of-two is computed. Tested on Intel Nehalem Cluster with openib, compilers GNU-4.6.1 and Intel-12.0.4 using mpi_testsuite -t "Collective" with 128 processes. This commit was SVN r25270.
2011-10-11 22:49:01 +00:00 · 2011-10-11 22:49:01 +00:00 · 4e6a6fc146
--- a/ompi/mca/btl/openib/btl_openib_mca.c
+++ b/ompi/mca/btl/openib/btl_openib_mca.c
@ -26,10 +26,11 @@

 #include <string.h>

+#include "opal/util/bit_ops.h"
 #include "opal/mca/installdirs/installdirs.h"
-#include "orte/util/show_help.h"
 #include "opal/util/output.h"
 #include "opal/mca/base/mca_base_param.h"
+#include "orte/util/show_help.h"
 #include "btl_openib.h"
 #include "btl_openib_mca.h"
 #include "btl_openib_ini.h"
@ -554,16 +555,10 @@ int btl_openib_register_mca_params(void)
            &mca_btl_openib_module.super));

    /* setup all the qp stuff */
-    mid_qp_size = mca_btl_openib_module.super.btl_eager_limit / 4;
    /* round mid_qp_size to smallest power of two */
-    for(i = 31; i > 0; i--) {
-        if(!(mid_qp_size & (1<<i))) {
-            continue;
-        }
-        mid_qp_size = (1<<i);
-        break;
-    }
+    mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1;

+    /* mid_qp_size = MAX (mid_qp_size, 1024); ?! */
    if(mid_qp_size <= 128) {
        mid_qp_size = 1024;
    }
--- a/ompi/mca/btl/sm/btl_sm.h
+++ b/ompi/mca/btl/sm/btl_sm.h
@ -39,6 +39,7 @@
 #include "knem_io.h"
 #endif  /* OMPI_BTL_SM_HAVE_KNEM */

+#include "opal/util/bit_ops.h"
 #include "opal/class/opal_free_list.h"
 #include "ompi/mca/btl/btl.h"
 #include "ompi/mca/common/sm/common_sm.h"
@ -265,9 +266,7 @@ static inline int sm_fifo_init(int fifo_size, mca_mpool_base_module_t *mpool,
    int i, qsize;

    /* figure out the queue size (a power of two that is at least 1) */
-    qsize = 1;
-    while ( qsize < fifo_size )
-        qsize <<= 1;
+    qsize = opal_next_poweroftwo_inclusive (fifo_size);

    /* allocate the queue in the receiver's address space */
    fifo->queue_recv = (volatile void **)mpool->mpool_alloc(
--- a/ompi/mca/btl/sm/btl_sm_component.c
+++ b/ompi/mca/btl/sm/btl_sm_component.c
@ -43,6 +43,7 @@

 #include "ompi/constants.h"
 #include "opal/mca/event/event.h"
+#include "opal/util/bit_ops.h"
 #include "opal/util/output.h"
 #include "orte/util/proc_info.h"
 #include "orte/util/show_help.h"
@ -225,13 +226,9 @@ static int sm_register(void)
 static int mca_btl_sm_component_open(void)
 {
    mca_btl_sm_component.sm_max_btls = 1;
+
    /* make sure the number of fifos is a power of 2 */
-    {
-        int i = 1;
-        while ( i < mca_btl_sm_component.nfifos )
-            i <<= 1;
-        mca_btl_sm_component.nfifos = i;
-    }
+    mca_btl_sm_component.nfifos = opal_next_poweroftwo_inclusive (mca_btl_sm_component.nfifos);

    /* make sure that queue size and lazy free parameter are compatible */
    if (mca_btl_sm_component.fifo_lazy_free >= (mca_btl_sm_component.fifo_size >> 1) )
--- a/ompi/mca/btl/wv/btl_wv_mca.c
+++ b/ompi/mca/btl/wv/btl_wv_mca.c
@ -26,10 +26,11 @@

 #include <string.h>

+#include "opal/util/bit_ops.h"
 #include "opal/mca/installdirs/installdirs.h"
-#include "orte/util/show_help.h"
 #include "opal/util/output.h"
 #include "opal/mca/base/mca_base_param.h"
+#include "orte/util/show_help.h"
 #include "btl_wv.h"
 #include "btl_wv_mca.h"
 #include "btl_wv_ini.h"
@ -471,16 +472,10 @@ int btl_wv_register_mca_params(void)
            &mca_btl_wv_module.super));

    /* setup all the qp stuff */
-    mid_qp_size = mca_btl_wv_module.super.btl_eager_limit / 4;
    /* round mid_qp_size to smallest power of two */
-    for(i = 31; i > 0; i--) {
-        if(!(mid_qp_size & (1<<i))) {
-            continue;
-        }
-        mid_qp_size = (1<<i);
-        break;
-    }
+    mid_qp_size = opal_next_poweroftwo (mca_btl_wv_module.super.btl_eager_limit / 4) >> 1;

+    /* mid_qp_size = MAX (mid_qp_size, 1024); ?! */
    if(mid_qp_size <= 128) {
        mid_qp_size = 1024;
    }
--- a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c
+++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c
@ -24,6 +24,7 @@
 #include <errno.h>

 #include "mpi.h"
+#include "opal/util/bit_ops.h"
 #include "ompi/constants.h"
 #include "ompi/mca/coll/coll.h"
 #include "ompi/mca/coll/base/coll_tags.h"
@ -112,7 +113,7 @@ mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts,

    if ((op->o_flags & OMPI_OP_FLAGS_COMMUTE) &&
        (buf_size < COMMUTATIVE_LONG_MSG) && (!zerocounts)) {
-        int tmp_size = 1, remain = 0, tmp_rank;
+        int tmp_size, remain = 0, tmp_rank;

        /* temporary receive buffer.  See coll_basic_reduce.c for details on sizing */
        recv_buf_free = (char*) malloc(buf_size);
@ -133,7 +134,7 @@ mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts,
        /* figure out power of two mapping: grow until larger than
           comm size, then go back one, to get the largest power of
           two less than comm size */
-        while (tmp_size <= size) tmp_size <<= 1;
+        tmp_size = opal_next_poweroftwo(size);
        tmp_size >>= 1;
        remain = size - tmp_size;

--- a/ompi/mca/coll/tuned/coll_tuned_allgather.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allgather.c
@ -20,6 +20,7 @@
 #include "ompi_config.h"

 #include "mpi.h"
+#include "opal/util/bit_ops.h"
 #include "ompi/constants.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/communicator/communicator.h"
@ -271,7 +272,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
   size = ompi_comm_size(comm);
   rank = ompi_comm_rank(comm);

-   for (pow2size  = 1; pow2size <= size; pow2size <<=1);
+   pow2size = opal_next_poweroftwo (size);
   pow2size >>=1;

   /* Current implementation only handles power-of-two number of processes.
--- a/ompi/mca/coll/tuned/coll_tuned_allreduce.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allreduce.c
@ -20,6 +20,7 @@
 #include "ompi_config.h"

 #include "mpi.h"
+#include "opal/util/bit_ops.h"
 #include "ompi/constants.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/communicator/communicator.h"
@ -170,7 +171,8 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
   tmprecv = (char*) rbuf;

   /* Determine nearest power of two less than or equal to size */
-   for (adjsize = 0x1; adjsize <= size; adjsize <<= 1); adjsize = adjsize >> 1;
+   adjsize = opal_next_poweroftwo (size);
+   adjsize >>= 1;

   /* Handle non-power-of-two case:
      - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and 
--- a/ompi/mca/coll/tuned/coll_tuned_barrier.c
+++ b/ompi/mca/coll/tuned/coll_tuned_barrier.c
@ -20,6 +20,7 @@
 #include "ompi_config.h"

 #include "mpi.h"
+#include "opal/util/bit_ops.h"
 #include "ompi/constants.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/mca/coll/coll.h"
@ -134,7 +135,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
                 rank));

    /* do nearest power of 2 less than size calc */
-    for( adjsize = 1; adjsize <= size; adjsize <<= 1 );
+    adjsize = opal_next_poweroftwo(size);
    adjsize >>= 1;

    /* if size is not exact power of two, perform an extra step */
@ -354,7 +355,7 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
                 rank));

    /* Find the nearest power of 2 of the communicator size. */
-    for(depth = 1; depth < size; depth <<= 1 );
+    depth = opal_next_poweroftwo_inclusive(size);

    for (jump=1; jump<depth; jump<<=1) {
        partner = rank ^ jump;
--- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
@ -20,6 +20,7 @@
 #include "ompi_config.h"

 #include "mpi.h"
+#include "opal/util/bit_ops.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/mca/coll/coll.h"
@ -489,7 +490,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
    total_message_size *= dsize;

    /* compute the nearest power of 2 */
-    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
+    pow2 = opal_next_poweroftwo_inclusive (comm_size);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
@ -540,7 +541,7 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
                 " rank %d com_size %d msg_length %lu",
                 ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));

-    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1); 
+    pow2_size = opal_next_poweroftwo_inclusive (communicator_size);

    /* Decision based on MX 2Gb results from Grig cluster at 
       The University of Tennesse, Knoxville 
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c
@ -21,6 +21,7 @@
 #include "ompi_config.h"

 #include "mpi.h"
+#include "opal/util/bit_ops.h"
 #include "ompi/constants.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/communicator/communicator.h"
@ -132,7 +133,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
 							    mca_coll_base_module_t *module)
 {
    int i, rank, size, count, err = OMPI_SUCCESS;
-    int tmp_size = 1, remain = 0, tmp_rank;
+    int tmp_size, remain = 0, tmp_rank;
    int *disps = NULL;
    ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
    char *recv_buf = NULL, *recv_buf_free = NULL;
@ -189,7 +190,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
    /* figure out power of two mapping: grow until larger than
       comm size, then go back one, to get the largest power of
       two less than comm size */
-    while (tmp_size <= size) tmp_size <<= 1;
+    tmp_size = opal_next_poweroftwo (size); 
    tmp_size >>= 1;
    remain = size - tmp_size;
   
--- a/ompi/mca/coll/tuned/coll_tuned_topo.c
+++ b/ompi/mca/coll/tuned/coll_tuned_topo.c
@ -19,6 +19,7 @@
 #include "ompi_config.h"

 #include "mpi.h"
+#include "opal/util/bit_ops.h"
 #include "ompi/constants.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/mca/coll/base/coll_tags.h"
@ -363,7 +364,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,

    if( index < 0 ) index += size;

-    while( mask <= index ) mask <<= 1;
+    mask = opal_next_poweroftwo(index);

    /* Now I can compute my father rank */
    if( root == rank ) {
--- a/opal/class/opal_hash_table.c
+++ b/opal/class/opal_hash_table.c
@ -25,6 +25,7 @@
 #include "opal/class/opal_list.h"
 #include "opal/class/opal_hash_table.h"
 #include "opal/constants.h"
+#include "opal/util/bit_ops.h"

 /*
 * opal_hash_table_t
@ -70,12 +71,7 @@ static void opal_hash_table_destruct(opal_hash_table_t* ht)
 int opal_hash_table_init(opal_hash_table_t* ht, size_t table_size)
 {
    size_t i;
-    size_t power2 = 1;
-    size_t tmp = table_size;
-    while(tmp) {
-       tmp >>= 1;
-       power2 <<= 1;
-    }
+    size_t power2 = opal_next_poweroftwo (table_size);

    ht->ht_mask = power2-1;
    ht->ht_table = (opal_list_t *)malloc(power2 * sizeof(opal_list_t));
--- a/opal/config/opal_setup_cc.m4
+++ b/opal/config/opal_setup_cc.m4
@ -246,7 +246,7 @@ AC_DEFUN([OPAL_SETUP_CC],[
        have_cc_builtin_expect=0
    fi
    AC_DEFINE_UNQUOTED([OPAL_C_HAVE_BUILTIN_EXPECT], [$have_cc_builtin_expect],
-          [Whether C compiler supports __builtin_expect])
+        [Whether C compiler supports __builtin_expect])

    # see if the C compiler supports __builtin_prefetch
    AC_CACHE_CHECK([if $CC supports __builtin_prefetch],
@ -262,7 +262,23 @@ AC_DEFUN([OPAL_SETUP_CC],[
        have_cc_builtin_prefetch=0
    fi
    AC_DEFINE_UNQUOTED([OPAL_C_HAVE_BUILTIN_PREFETCH], [$have_cc_builtin_prefetch],
-          [Whether C compiler supports __builtin_prefetch])
+        [Whether C compiler supports __builtin_prefetch])
+
+    # see if the C compiler supports __builtin_clz
+    AC_CACHE_CHECK([if $CC supports __builtin_clz],
+        [ompi_cv_cc_supports___builtin_clz],
+        [AC_TRY_LINK([],
+            [int value = 0xffff; /* we know we have 16 bits set */
+             if ((8*sizeof(int)-16) != __builtin_clz(value)) return 0;],
+            [ompi_cv_cc_supports___builtin_clz="yes"],
+            [ompi_cv_cc_supports___builtin_clz="no"])])
+    if test "$ompi_cv_cc_supports___builtin_clz" = "yes" ; then
+        have_cc_builtin_clz=1
+    else
+        have_cc_builtin_clz=0
+    fi
+    AC_DEFINE_UNQUOTED([OPAL_C_HAVE_BUILTIN_CLZ], [$have_cc_builtin_clz],
+        [Whether C compiler supports __builtin_clz])

    # Preload the optflags for the case where the user didn't specify
    # any.  If we're using GNU compilers, use -O3 (since it GNU
--- a/opal/util/bit_ops.h
+++ b/opal/util/bit_ops.h
@ -5,7 +5,7 @@
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
+ * Copyright (c) 2004-2011 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
@ -19,6 +19,8 @@
 #ifndef OPAL_BIT_OPS_H
 #define OPAL_BIT_OPS_H

+#include "opal/prefetch.h"
+
 /**
 * Calculates the highest bit in an integer
 *
@ -33,21 +35,35 @@
 *
 * WARNING: *NO* error checking is performed.  This is meant to be a
 * fast inline function.
+ * Using __builtin_clz (count-leading-zeros) uses 3 cycles instead
+ * of 17 cycles (on average value, with start=32)
+ * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
 static inline int opal_hibit(int value, int start)
 {
-  unsigned int mask;
+    unsigned int mask;

-  --start;
-  mask = 1 << start;
+#if OPAL_C_HAVE_BUILTIN_CLZ
+    /* Only look at the part that the caller wanted looking at */
+    mask = value & ((1 << start) - 1);

-  for (; start >= 0; --start, mask >>= 1) {
-    if (value & mask) {
-      break;
+    if (OPAL_UNLIKELY (0 == mask)) {
+        return -1;
    }
-  }
+
+    start = (8*sizeof(int)-1) - __builtin_clz(mask);
+#else
+    --start;
+    mask = 1 << start;
+
+    for (; start >= 0; --start, mask >>= 1) {
+        if (value & mask) {
+            break;
+        }
+    }
+#endif
  
-  return start;
+    return start;
 }


@ -63,16 +79,84 @@ static inline int opal_hibit(int value, int start)
 *
 * WARNING: *NO* error checking is performed.  This is meant to be a
 * fast inline function.
+ * Using __builtin_clz (count-leading-zeros) uses 3 cycles instead of 50 cycles
+ * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
 static inline int opal_cube_dim(int value) 
 {
    int dim, size;

-    for (dim = 0, size = 1; size < value; ++dim, size <<= 1) {
-        continue;
+#if OPAL_C_HAVE_BUILTIN_CLZ
+    if (OPAL_UNLIKELY (1 >= value)) {
+        return 0;
    }
+    size = 8 * sizeof(int);
+    dim = size - __builtin_clz(value-1); 
+#else
+    for (dim = 0, size = 1; size < value; ++dim, size <<= 1) /* empty */;
+#endif

    return dim;
 }

+
+/**
+ * @brief Returns next power-of-two of the given value.
+ *
+ * @param value The integer value to return power of 2
+ *
+ * @returns The next power of two
+ *
+ * WARNING: *NO* error checking is performed.  This is meant to be a
+ * fast inline function.
+ * Using __builtin_clz (count-leading-zeros) uses 4 cycles instead of 77
+ * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
+ */
+static inline int opal_next_poweroftwo(int value)
+{
+    int power2;
+
+#if OPAL_C_HAVE_BUILTIN_CLZ
+    if (OPAL_UNLIKELY (0 == value)) {
+        return 1;
+    }
+    power2 = 1 << (8 * sizeof (int) - __builtin_clz(value));
+#else
+    for (power2 = 1; value > 0; value >>= 1, power2 <<= 1) /* empty */;
+#endif
+
+    return power2;
+}
+
+
+/**
+ * @brief Returns next power-of-two of the given value (and the value itselve if already power-of-two).
+ *
+ * @param value The integer value to return power of 2
+ *
+ * @returns The next power of two (inclusive)
+ *
+ * WARNING: *NO* error checking is performed.  This is meant to be a
+ * fast inline function.
+ * Using __builtin_clz (count-leading-zeros) uses 4 cycles instead of 56
+ * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
+ */
+static inline int opal_next_poweroftwo_inclusive(int value)
+{
+    int power2;
+
+#if OPAL_C_HAVE_BUILTIN_CLZ
+    if (OPAL_UNLIKELY (1 >= value)) {
+        return 1;
+    }
+    power2 = 1 << (8 * sizeof (int) - __builtin_clz(value - 1));
+#else
+    for (power2 = 1 ; power2 < value; power2 <<= 1) /* empty */;
+#endif
+
+    return power2;
+}
+
+
 #endif /* OPAL_BIT_OPS_H */
+
--- a/test/util/Makefile.am
+++ b/test/util/Makefile.am
@ -5,7 +5,7 @@
 # Copyright (c) 2004-2005 The University of Tennessee and The University
 #                         of Tennessee Research Foundation.  All rights
 #                         reserved.
-# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
+# Copyright (c) 2004-2011 High Performance Computing Center Stuttgart, 
 #                         University of Stuttgart.  All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 #                         All rights reserved.
@ -31,8 +31,9 @@ AM_CPPFLAGS = -I$(top_srcdir)/test/support


 check_PROGRAMS = \
-	opal_sos \
-	opal_path_nfs
+	opal_bit_ops \
+	opal_path_nfs \
+	opal_sos

 TESTS = \
 	$(check_PROGRAMS)
@ -66,6 +67,13 @@ TESTS = \
 #        $(top_builddir)/test/support/libsupport.a 
 #opal_basename_DEPENDENCIES = $(opal_basename_LDADD)

+opal_bit_ops_SOURCES = opal_bit_ops.c
+opal_bit_ops_LDADD = \
+        $(top_builddir)/opal/libopen-pal.la \
+        $(top_builddir)/test/support/libsupport.a
+opal_bit_ops_DEPENDENCIES = $(opal_path_nfs_LDADD)
+
+
 opal_path_nfs_SOURCES = opal_path_nfs.c
 opal_path_nfs_LDADD = \
        $(top_builddir)/opal/libopen-pal.la \
--- a/test/util/opal_bit_ops.c
+++ b/test/util/opal_bit_ops.c
@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2011 High Performance Computing Center Stuttgart, 
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ * 
+ * Additional copyrights may follow
+ * 
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "support.h"
+#include "opal/util/bit_ops.h"
+#include "opal/util/output.h"
+
+/*
+#define DEBUG
+*/
+
+static int test_hibit(int value, int start);
+static int test_cube_dim(int value);
+static int test_next_poweroftwo(int value);
+static int test_next_poweroftwo_inclusive(int value);
+
+int main(int argc, char* argv[])
+{
+    int i;
+    int vals[] = {0, 1, 2, 3, 4, 5, 127, 128, 129, (1 << 29) -1, (1 << 29), (1 << 29) +1, (1 << 30) -1, (1 << 30) /* And NOT (1 << 30) +1 */};
+    test_init("opal_bit_ops()");
+
+#ifdef DEBUG
+    printf ("Test usage: ./opal_bit_ops [VALUES]\n");
+#endif
+
+    if (1 < argc) {
+        for (i = 1; i < argc; i++) {
+            int value;
+            value = atoi (argv[i]);
+            printf ("Testing %d. argument test_next_poweroftwo(%d): %s\n",
+                    i, value, test_next_poweroftwo(value) ? "correct" : "wrong");
+        }
+    }
+
+    for (i = 0; i < (int)(sizeof(vals)/sizeof(vals[0])); i++) {
+        test_hibit (vals[i], 8 * sizeof(int) -2);
+        test_hibit (vals[i], 3);
+        test_cube_dim (vals[i]);
+        test_next_poweroftwo (vals[i]);
+        test_next_poweroftwo_inclusive (vals[i]);
+    }
+
+    /* All done */
+    return test_finalize();
+}
+
+
+/* REFERENCE FUNCTION */
+static int hibit(int value, int start)
+{
+    unsigned int mask;
+
+    --start;
+    mask = 1 << start;
+
+    for (; start >= 0; --start, mask >>= 1) {
+        if (value & mask) {
+            break;
+        }
+    }
+
+    return start;
+}
+
+static int test_hibit(int value, int start)
+{
+    int out;
+    int bit = hibit (value, start);
+
+#ifdef DEBUG
+    printf ("test_hibit(): value:%d expect:%d\n",
+            value, bit);
+#endif
+
+    if (bit == (out = opal_hibit (value, start))) {
+        test_success();
+        return 1;
+    } else {
+        char * msg;
+        asprintf(&msg, "Mismatch for hibit (w/ start:%d): value:%d, expected:%d got:%d\n",
+                 start, value, bit, out);
+        test_failure(msg);
+        free(msg);
+    }
+    return 0;
+}
+
+
+/* REFERENCE FUNCTION */
+static int cube_dim(int value)
+{
+    int dim, size;
+
+    for (dim = 0, size = 1; size < value; ++dim, size <<= 1);
+
+    return dim;
+}
+
+static int test_cube_dim(int value)
+{
+    int out;
+    int dim = cube_dim (value);
+
+#ifdef DEBUG
+    printf ("test_cube_dim(): value:%d expect:%d\n",
+            value, dim);
+#endif
+
+    if (dim == (out = opal_cube_dim (value))) {
+        test_success();
+        return 1;
+    } else {
+        char * msg;
+        asprintf(&msg, "Mismatch for cube_dim: value:%d, expected:%d got:%d\n",
+                 value, dim, out);
+        test_failure(msg);
+        free(msg);
+    }
+    return 0;
+}
+
+
+/* REFERENCE FUNCTION */
+static int next_poweroftwo(int value)
+{
+    int power2;
+
+    for (power2 = 1; value; value >>=1, power2 <<=1) /* empty */;
+
+    return power2;
+}
+
+
+static int test_next_poweroftwo(int value)
+{
+    int out;
+    int power2 = next_poweroftwo (value);
+
+#ifdef DEBUG
+    printf ("test_next_poweroftwo(): value:%d expect:%d\n",
+            value, power2);
+#endif
+
+    if (power2 == (out = opal_next_poweroftwo (value))) {
+        test_success();
+        return 1;
+    } else {
+        char * msg;
+        asprintf(&msg, "Mismatch for power-of-two: value:%d, expected:%d got:%d\n",
+                 value, power2, out);
+        test_failure(msg);
+        free(msg);
+    }
+    return 0;
+}
+
+
+
+/* REFERENCE FUNCTION */
+static int next_poweroftwo_inclusive(int value)
+{
+    int power2 = 1;
+
+    while ( power2 < value )
+        power2 <<= 1;
+
+    return power2;
+}
+
+static int test_next_poweroftwo_inclusive(int value)
+{
+    int out;
+    int power2 = next_poweroftwo_inclusive (value);
+
+#ifdef DEBUG
+    printf ("test_next_poweroftwo(): value:%d expect:%d\n",
+            value, power2);
+#endif
+
+    if (power2 == (out = opal_next_poweroftwo_inclusive (value))) {
+        test_success();
+        return 1;
+    } else {
+        char * msg;
+        asprintf(&msg, "Mismatch for power-of-two-inclusive: value:%d, expected:%d got:%d\n",
+                 value, power2, out);
+        test_failure(msg);
+        free(msg);
+    }
+
+    return 0;
+}
+
+
+
+