From 381657b3ea69ea50c1dd43588f795fa7a0459ecb Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 12 Dec 2013 08:27:41 -0800 Subject: [PATCH] lib/util: More portable use of builtin popcnt. - Use the GCC predefined macro __POPCNT__ to detect the availability of fast __builtin_popcnt function. - Use portable preprocessor macros to detect 64-bit build. - Only define the 32-bit parts when needed and declare the count_1bits_8 at file scope to silence a warning. This time I have tested all code paths to make sure no warnings are generated. Signed-off-by: Jarno Rajahalme Reviewed-by: Simon Horman --- lib/util.c | 2 +- lib/util.h | 62 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/lib/util.c b/lib/util.c index 13d41a70d..000504cff 100644 --- a/lib/util.c +++ b/lib/util.c @@ -901,7 +901,7 @@ raw_clz64(uint64_t n) } #endif -#if !(__GNUC__ >= 4 && defined(__corei7)) +#if NEED_COUNT_1BITS_8 #define INIT1(X) \ ((((X) & (1 << 0)) != 0) + \ (((X) & (1 << 1)) != 0) + \ diff --git a/lib/util.h b/lib/util.h index 8d810c2b7..0327ab0f2 100644 --- a/lib/util.h +++ b/lib/util.h @@ -371,49 +371,55 @@ log_2_ceil(uint64_t n) return log_2_floor(n) + !is_pow2(n); } -extern const uint8_t count_1bits_8[256]; - -/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */ +/* unsigned int count_1bits(uint64_t x): + * + * Returns the number of 1-bits in 'x', between 0 and 64 inclusive. */ +#if UINTPTR_MAX == UINT64_MAX +static inline unsigned int +count_1bits(uint64_t x) +{ +#if __GNUC__ >= 4 && __POPCNT__ + return __builtin_popcountll(x); +#else + /* This portable implementation is the fastest one we know of for 64 + * bits, and about 3x faster than GCC 4.7 __builtin_popcountll(). 
*/ + const uint64_t h55 = UINT64_C(0x5555555555555555); + const uint64_t h33 = UINT64_C(0x3333333333333333); + const uint64_t h0F = UINT64_C(0x0F0F0F0F0F0F0F0F); + const uint64_t h01 = UINT64_C(0x0101010101010101); + x -= (x >> 1) & h55; /* Count of each 2 bits in-place. */ + x = (x & h33) + ((x >> 2) & h33); /* Count of each 4 bits in-place. */ + x = (x + (x >> 4)) & h0F; /* Count of each 8 bits in-place. */ + return (x * h01) >> 56; /* Sum of all bytes. */ +#endif +} +#else /* Not 64-bit. */ +#if __GNUC__ >= 4 && __POPCNT__ static inline unsigned int -count_1bits_32(uint32_t x) +count_1bits_32__(uint32_t x) { -#if __GNUC__ >= 4 && defined(__corei7) - /* __builtin_popcount() is fast only when supported by the CPU. */ return __builtin_popcount(x); +} #else +#define NEED_COUNT_1BITS_8 1 +extern const uint8_t count_1bits_8[256]; +static inline unsigned int +count_1bits_32__(uint32_t x) +{ /* This portable implementation is the fastest one we know of for 32 bits, * and faster than GCC __builtin_popcount(). */ return (count_1bits_8[x & 0xff] + count_1bits_8[(x >> 8) & 0xff] + count_1bits_8[(x >> 16) & 0xff] + count_1bits_8[x >> 24]); -#endif } - -/* Returns the number of 1-bits in 'x', between 0 and 64 inclusive. */ +#endif static inline unsigned int count_1bits(uint64_t x) { - if (sizeof(void *) == 8) { /* 64-bit CPU */ -#if __GNUC__ >= 4 && defined(__corei7) - /* __builtin_popcountll() is fast only when supported by the CPU. */ - return __builtin_popcountll(x); -#else - /* This portable implementation is the fastest one we know of for 64 - * bits, and about 3x faster than GCC 4.7 __builtin_popcountll(). */ - const uint64_t h55 = UINT64_C(0x5555555555555555); - const uint64_t h33 = UINT64_C(0x3333333333333333); - const uint64_t h0F = UINT64_C(0x0F0F0F0F0F0F0F0F); - const uint64_t h01 = UINT64_C(0x0101010101010101); - x -= (x >> 1) & h55; /* Count of each 2 bits in-place. */ - x = (x & h33) + ((x >> 2) & h33); /* Count of each 4 bits in-place. 
*/ - x = (x + (x >> 4)) & h0F; /* Count of each 8 bits in-place. */ - return (x * h01) >> 56; /* Sum of all bytes. */ -#endif - } else { /* 32-bit CPU */ - return count_1bits_32(x) + count_1bits_32(x >> 32); - } + return count_1bits_32__(x) + count_1bits_32__(x >> 32); } +#endif /* Returns the rightmost 1-bit in 'x' (e.g. 01011000 => 00001000), or 0 if 'x' * is 0. */ -- 2.43.0