4e6a6fc146
zeroes); if so, use it for bit-operations like opal_cube_dim and opal_hibit. Implement two versions of power-of-two. In case of opal_next_poweroftwo, this reduces the average execution time from 83 cycles to 4 cycles (Intel Nehalem, icc, -O2, inlining, measured rdtsc, with loop over 2^27 values). Numbers for other functions are similar (but of course heavily depend on the usage, e.g. opal_hibit() with a start of 4 does not save much). The bsr instruction on AMD Opteron is also not as fast. - Replace various places where the next power-of-two is computed. Tested on Intel Nehalem Cluster with openib, compilers GNU-4.6.1 and Intel-12.0.4 using mpi_testsuite -t "Collective" with 128 processes. This commit was SVN r25270.
163 строки
4.3 KiB
C
163 строки
4.3 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2011 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#ifndef OPAL_BIT_OPS_H
|
|
#define OPAL_BIT_OPS_H
|
|
|
|
#include "opal/prefetch.h"
|
|
|
|
/**
 * Calculates the highest bit in an integer
 *
 * @param value The integer value to examine
 * @param start Number of low-order bits to examine: only bit positions
 *              0 .. start-1 of "value" are considered
 *
 * @returns pos Position of the highest set bit among the examined bits,
 *              or -1 if none of them are set.
 *
 * Look at the integer "value" starting at bit position "start - 1", and
 * move to the right.  Return the index of the highest bit that is set
 * to 1.
 *
 * "start" is clamped to the range [0, number of bits in an int]:
 * a value of 0 or less examines nothing (returns -1), and a value equal
 * to or larger than the width of an int examines the whole value.  The
 * mask is built with unsigned arithmetic so that neither case ends up
 * shifting by a negative amount, by the full width of the type, or into
 * the sign bit (all undefined for a signed int).
 *
 * This is meant to be a fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 3 cycles instead
 * of 17 cycles (on average value, with start=32)
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_hibit(int value, int start)
{
    const int nbits = (int) (8 * sizeof(unsigned int));
    unsigned int bits = (unsigned int) value;

    if (start <= 0) {
        /* No bits requested: nothing can be set */
        return -1;
    }

    if (start < nbits) {
        /* Only look at the part that the caller wanted looking at */
        bits &= (1u << start) - 1u;
    } else {
        /* Whole word requested; "1u << nbits" would be undefined */
        start = nbits;
    }

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* __builtin_clz is undefined for 0, so filter that case first */
    if (OPAL_UNLIKELY (0 == bits)) {
        return -1;
    }

    start = (nbits - 1) - __builtin_clz(bits);
#else
    /* Scan downwards from bit start-1; falls out with start == -1 if
       no bit is set */
    for (--start; start >= 0; --start) {
        if (bits & (1u << start)) {
            break;
        }
    }
#endif

    return start;
}
|
|
|
|
|
|
/**
 * Returns the cube dimension of a given value.
 *
 * @param value The integer value to examine
 *
 * @returns cubedim The smallest cube dimension containing that value
 *
 * Look at the integer "value" and calculate the smallest power of two
 * dimension that contains that value, i.e. the smallest dim such that
 * (2 ^ dim) >= value.  Values of 1 or less (including negative values)
 * yield 0.
 *
 * The loop version counts with an unsigned size so that it terminates
 * for every positive int; a signed counter overflows once value exceeds
 * 2^30 and the comparison then never becomes false.
 *
 * This is meant to be a fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 3 cycles instead of 50 cycles
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_cube_dim(int value)
{
    int dim;

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Also keeps 0 away from __builtin_clz, for which it is undefined */
    if (OPAL_UNLIKELY (1 >= value)) {
        return 0;
    }

    dim = (int) (8 * sizeof(int)) - __builtin_clz((unsigned int) (value - 1));
#else
    unsigned int size;

    if (value <= 1) {
        return 0;
    }

    /* value is positive here, so the conversion to unsigned is exact */
    for (dim = 0, size = 1u; size < (unsigned int) value; ++dim, size <<= 1) {
        /* empty */
    }
#endif

    return dim;
}
|
|
|
|
|
|
/**
 * @brief Returns next power-of-two of the given value.
 *
 * @param value The integer value to return power of 2
 *
 * @returns The smallest power of two that is strictly greater than
 *          value (so a power of two is mapped to the next one, e.g.
 *          4 -> 8).  Values of 0 or less yield 1.
 *
 * The result is computed with unsigned arithmetic and converted back to
 * int on return.  For value >= 2^30 (with a 32-bit int) the true result
 * does not fit into an int; the conversion is then
 * implementation-defined (GCC and Clang wrap to INT_MIN).  Callers must
 * not rely on a meaningful result for such inputs.
 *
 * This is meant to be a fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 4 cycles instead of 77
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_next_poweroftwo(int value)
{
    unsigned int power2;

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* __builtin_clz is undefined for 0, and a negative value has no
       leading zeroes, which would turn the shift below into a shift by
       the full width of the type.  Answer 1 for both, like the loop
       version does. */
    if (OPAL_UNLIKELY (0 >= value)) {
        return 1;
    }

    power2 = 1u << (8 * sizeof (int) - __builtin_clz((unsigned int) value));
#else
    unsigned int remaining;

    if (value <= 0) {
        return 1;
    }

    /* Shift once per significant bit of value */
    for (power2 = 1u, remaining = (unsigned int) value;
         remaining > 0;
         remaining >>= 1, power2 <<= 1) {
        /* empty */
    }
#endif

    return (int) power2;
}
|
|
|
|
|
|
/**
 * @brief Returns next power-of-two of the given value (and the value itself if already power-of-two).
 *
 * @param value The integer value to return power of 2
 *
 * @returns The smallest power of two that is greater than or equal to
 *          value (inclusive).  Values of 1 or less yield 1.
 *
 * The result is computed with unsigned arithmetic and converted back to
 * int on return.  The loop version therefore terminates for every
 * positive int; with a signed accumulator it overflows and never
 * reaches value once value exceeds 2^30.  For value > 2^30 (with a
 * 32-bit int) the true result does not fit into an int; the conversion
 * is then implementation-defined (GCC and Clang wrap to INT_MIN).
 *
 * This is meant to be a fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 4 cycles instead of 56
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_next_poweroftwo_inclusive(int value)
{
    unsigned int power2;

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Also keeps 0 away from __builtin_clz, for which it is undefined */
    if (OPAL_UNLIKELY (1 >= value)) {
        return 1;
    }

    power2 = 1u << (8 * sizeof (int) - __builtin_clz((unsigned int) (value - 1)));
#else
    if (value <= 1) {
        return 1;
    }

    /* value is positive here, so the conversion to unsigned is exact */
    for (power2 = 1u; power2 < (unsigned int) value; power2 <<= 1) {
        /* empty */
    }
#endif

    return (int) power2;
}
|
|
|
|
|
|
#endif /* OPAL_BIT_OPS_H */
|
|
|