1
1
openmpi/opal/util/bit_ops.h
Rainer Keller 4e6a6fc146 - Check, whether the compiler supports __builtin_clz (count leading
zeroes);
   if so, use it for bit-operations like opal_cube_dim and opal_hibit.
   Implement two versions of power-of-two.
   In case of opal_next_poweroftwo, this reduces the average execution
   time from 83 cycles to 4 cycles (Intel Nehalem, icc, -O2, inlining,
   measured rdtsc, with loop over 2^27 values).
   Numbers for other functions are similar (but of course heavily depend
   on the usage, e.g. opal_hibit() with a start of 4 does not save
   much).  The bsr instruction on AMD Opteron is also not as fast.

 - Replace various places where the next power-of-two is computed.
   
   Tested on Intel Nehalem Cluster with openib, compilers GNU-4.6.1 and
   Intel-12.0.4 using mpi_testsuite -t "Collective" with 128 processes.

This commit was SVN r25270.
2011-10-11 22:49:01 +00:00

163 lines
4.3 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2011 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_BIT_OPS_H
#define OPAL_BIT_OPS_H
#include "opal/prefetch.h"
/**
 * Calculates the highest bit in an integer
 *
 * @param value The integer value to examine
 * @param start Number of low-order bits to consider; the search begins
 *              at bit position (start - 1)
 *
 * @returns pos Position of the highest set bit below "start", or -1 if
 *              none of those bits are set.
 *
 * Look at the integer "value" starting at bit position "start - 1", and
 * move to the right (towards bit 0).  Return the index of the highest
 * bit that is set to 1.
 *
 * A "start" of zero or less selects no bits and therefore yields -1.
 * A "start" equal to or larger than the width of an int examines every
 * bit of "value" (including the sign bit).  All masking is done on
 * unsigned operands so that neither case triggers an out-of-range or
 * signed-overflowing shift.
 *
 * WARNING: apart from the range handling of "start" described above,
 * *NO* error checking is performed.  This is meant to be a fast inline
 * function.
 * Using __builtin_clz (count-leading-zeros) uses 3 cycles instead
 * of 17 cycles (on average value, with start=32)
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_hibit(int value, int start)
{
    const int width = (int) (8 * sizeof(unsigned int));
    unsigned int bits = (unsigned int) value;

    /* No bits requested: nothing can be set */
    if (start <= 0) {
        return -1;
    }

    /* Only look at the part that the caller wanted looking at.  The
     * mask is only built when the shift count is below the type width;
     * otherwise every bit is already in range. */
    if (start < width) {
        bits &= (1u << start) - 1u;
    } else {
        start = width;
    }

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* __builtin_clz is undefined for 0, so filter that out first */
    if (OPAL_UNLIKELY (0 == bits)) {
        return -1;
    }
    return (width - 1) - __builtin_clz(bits);
#else
    /* Walk down from bit (start - 1); falls out with start == -1 when
     * no bit in range is set */
    for (--start; start >= 0; --start) {
        if (bits & (1u << start)) {
            break;
        }
    }
    return start;
#endif
}
/**
 * Returns the cube dimension of a given value.
 *
 * @param value The integer value to examine
 *
 * @returns cubedim The smallest cube dimension containing that value
 *
 * Look at the integer "value" and calculate the smallest dimension
 * "dim" such that (2 ^ dim) >= value.  Values of 1 or less (including
 * negative values) yield a dimension of 0.
 *
 * The loop version counts with an unsigned accumulator so that it
 * terminates (with a result of 31 for a 32-bit int) for values above
 * 2^30, matching the __builtin_clz version.
 *
 * WARNING: *NO* error checking is performed.  This is meant to be a
 * fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 3 cycles instead of 50 cycles
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_cube_dim(int value)
{
    int dim;
#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Also keeps 0 away from __builtin_clz, for which it is undefined */
    if (OPAL_UNLIKELY (1 >= value)) {
        return 0;
    }
    dim = (int) (8 * sizeof(int)) - __builtin_clz((unsigned int) value - 1u);
#else
    unsigned int size;

    /* Guard first: a negative value must not be compared as unsigned */
    if (1 >= value) {
        return 0;
    }
    for (dim = 0, size = 1u; size < (unsigned int) value; ++dim, size <<= 1) /* empty */;
#endif
    return dim;
}
/**
 * @brief Returns next power-of-two of the given value.
 *
 * @param value The integer value to return power of 2
 *
 * @returns The next power of two, i.e. the smallest power of two that
 *          is strictly greater than "value"; 1 if "value" is zero or
 *          negative.
 *
 * The power of two is accumulated in an unsigned variable so that no
 * signed shift can overflow.  For values of 2^30 and above (32-bit
 * int) the mathematical result does not fit into an int; the value
 * returned is then whatever the implementation produces when
 * converting 2^31 to int.
 *
 * WARNING: *NO* error checking is performed.  This is meant to be a
 * fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 4 cycles instead of 77
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_next_poweroftwo(int value)
{
    unsigned int power2;
#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Zero is undefined for __builtin_clz, and a negative value would
     * have zero leading zeroes and request a shift by the full width.
     * Return 1 for both, as the loop version does. */
    if (OPAL_UNLIKELY (0 >= value)) {
        return 1;
    }
    power2 = 1u << (8 * sizeof (int) - __builtin_clz((unsigned int) value));
#else
    for (power2 = 1u; value > 0; value >>= 1, power2 <<= 1) /* empty */;
#endif
    return (int) power2;
}
/**
 * @brief Returns next power-of-two of the given value (and the value itself if already power-of-two).
 *
 * @param value The integer value to return power of 2
 *
 * @returns The next power of two (inclusive), i.e. the smallest power
 *          of two that is greater than or equal to "value"; 1 if
 *          "value" is 1 or less.
 *
 * The power of two is accumulated in an unsigned variable, so the loop
 * version terminates for every input.  For values above 2^30 (32-bit
 * int) the mathematical result does not fit into an int; the value
 * returned is then whatever the implementation produces when
 * converting 2^31 to int.
 *
 * WARNING: *NO* error checking is performed.  This is meant to be a
 * fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 4 cycles instead of 56
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_next_poweroftwo_inclusive(int value)
{
    unsigned int power2;
#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Also keeps 0 away from __builtin_clz, for which it is undefined */
    if (OPAL_UNLIKELY (1 >= value)) {
        return 1;
    }
    power2 = 1u << (8 * sizeof (int) - __builtin_clz((unsigned int) value - 1u));
#else
    /* Guard first: a negative value must not be compared as unsigned */
    if (1 >= value) {
        return 1;
    }
    for (power2 = 1u; power2 < (unsigned int) value; power2 <<= 1) /* empty */;
#endif
    return (int) power2;
}
#endif /* OPAL_BIT_OPS_H */