From c3cc4d2dd265823828d93278b5d1ead4faad85ba Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Wed, 27 Nov 2013 12:58:46 -0800 Subject: [PATCH] util: Better count_1bits(). Inline, use another well-known algorithm for 64-bit builds, and use builtins when they are known to be fast at compile time. A 32-bit version of the alternate algorithm is slower than the existing implementation, so the old one is used for 32-bit builds. Inline assembler would be a bit faster on 32-bit i7 build, but we use the GCC builtin for portability. It should be stressed builds for specific CPUs do not work on others CPUs, and that OVS build system or runtime does not currently support CPU detection. Speed improvement v.s. existing implementation / GCC 4.7 __builtin_popcountll(): i386: 64% (inlining) / 380% i386 on i7: 240% (inlining + builtin) / 820% x86_64: 59% (inlining + different algorithm) / 190% x86_64 on i7: 370% (inlining + builtin) / 0% Signed-off-by: Jarno Rajahalme Acked-by: Ben Pfaff --- lib/util.c | 29 +++++------------------------ lib/util.h | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/lib/util.c b/lib/util.c index 76f4cd6a0..13d41a70d 100644 --- a/lib/util.c +++ b/lib/util.c @@ -901,14 +901,7 @@ raw_clz64(uint64_t n) } #endif -/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */ -static unsigned int -count_1bits_32(uint32_t x) -{ - /* In my testing, this implementation is over twice as fast as any other - * portable implementation that I tried, including GCC 4.4 - * __builtin_popcount(), although nonportable asm("popcnt") was over 50% - * faster. */ +#if !(__GNUC__ >= 4 && defined(__corei7)) #define INIT1(X) \ ((((X) & (1 << 0)) != 0) + \ (((X) & (1 << 1)) != 0) + \ @@ -925,22 +918,10 @@ count_1bits_32(uint32_t x) #define INIT32(X) INIT16(X), INIT16((X) + 16) #define INIT64(X) INIT32(X), INIT32((X) + 32) - static const uint8_t count_1bits_8[256] = { - INIT64(0), INIT64(64), INIT64(128), INIT64(192) - }; - - return (count_1bits_8[x & 0xff] + - count_1bits_8[(x >> 8) & 0xff] + - count_1bits_8[(x >> 16) & 0xff] + - count_1bits_8[x >> 24]); -} - -/* Returns the number of 1-bits in 'x', between 0 and 64 inclusive. */ -unsigned int -count_1bits(uint64_t x) -{ - return count_1bits_32(x) + count_1bits_32(x >> 32); -} +const uint8_t count_1bits_8[256] = { + INIT64(0), INIT64(64), INIT64(128), INIT64(192) +}; +#endif /* Returns true if the 'n' bytes starting at 'p' are zeros. */ bool diff --git a/lib/util.h b/lib/util.h index 5c23962c8..7c5eacbd8 100644 --- a/lib/util.h +++ b/lib/util.h @@ -371,7 +371,48 @@ log_2_ceil(uint64_t n) return log_2_floor(n) + !is_pow2(n); } -unsigned int count_1bits(uint64_t); +/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */ +static inline unsigned int +count_1bits_32(uint32_t x) +{ +#if __GNUC__ >= 4 && defined(__corei7) + /* __builtin_popcount() is fast only when supported by the CPU. */ + return __builtin_popcount(x); +#else + extern const uint8_t count_1bits_8[256]; + /* This portable implementation is the fastest one we know of for 32 bits, + * and faster than GCC __builtin_popcount(). */ + return (count_1bits_8[x & 0xff] + + count_1bits_8[(x >> 8) & 0xff] + + count_1bits_8[(x >> 16) & 0xff] + + count_1bits_8[x >> 24]); +#endif +} + +/* Returns the number of 1-bits in 'x', between 0 and 64 inclusive. */ +static inline unsigned int +count_1bits(uint64_t x) +{ + if (sizeof(void *) == 8) { /* 64-bit CPU */ +#if __GNUC__ >= 4 && defined(__corei7) + /* __builtin_popcountll() is fast only when supported by the CPU. */ + return __builtin_popcountll(x); +#else + /* This portable implementation is the fastest one we know of for 64 + * bits, and about 3x faster than GCC 4.7 __builtin_popcountll(). */ + const uint64_t h55 = UINT64_C(0x5555555555555555); + const uint64_t h33 = UINT64_C(0x3333333333333333); + const uint64_t h0F = UINT64_C(0x0F0F0F0F0F0F0F0F); + const uint64_t h01 = UINT64_C(0x0101010101010101); + x -= (x >> 1) & h55; /* Count of each 2 bits in-place. */ + x = (x & h33) + ((x >> 2) & h33); /* Count of each 4 bits in-place. */ + x = (x + (x >> 4)) & h0F; /* Count of each 8 bits in-place. */ + return (x * h01) >> 56; /* Sum of all bytes. */ +#endif + } else { /* 32-bit CPU */ + return count_1bits_32(x) + count_1bits_32(x >> 32); + } +} /* Returns the rightmost 1-bit in 'x' (e.g. 01011000 => 00001000), or 0 if 'x' * is 0. */ -- 2.47.0