}
#endif
-/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */
-static unsigned int
-count_1bits_32(uint32_t x)
-{
- /* In my testing, this implementation is over twice as fast as any other
- * portable implementation that I tried, including GCC 4.4
- * __builtin_popcount(), although nonportable asm("popcnt") was over 50%
- * faster. */
+#if !(__GNUC__ >= 4 && defined(__corei7))
/* INIT1(X) expands to the number of 1-bits in the constant byte X, computed
 * as the sum of eight boolean bit tests.  INIT2..INIT64 expand to
 * comma-separated runs of 2..64 consecutive such counts, so that
 * INIT64(0), INIT64(64), INIT64(128), INIT64(192) together initialize a
 * 256-entry count-1bits-per-byte lookup table. */
#define INIT1(X) \
    ((((X) & (1 << 0)) != 0) + \
     (((X) & (1 << 1)) != 0) + \
     (((X) & (1 << 2)) != 0) + \
     (((X) & (1 << 3)) != 0) + \
     (((X) & (1 << 4)) != 0) + \
     (((X) & (1 << 5)) != 0) + \
     (((X) & (1 << 6)) != 0) + \
     (((X) & (1 << 7)) != 0))
#define INIT2(X)   INIT1(X),  INIT1((X) +  1)
#define INIT4(X)   INIT2(X),  INIT2((X) +  2)
#define INIT8(X)   INIT4(X),  INIT4((X) +  4)
#define INIT16(X)  INIT8(X),  INIT8((X) +  8)
#define INIT32(X) INIT16(X), INIT16((X) + 16)
#define INIT64(X) INIT32(X), INIT32((X) + 32)
- static const uint8_t count_1bits_8[256] = {
- INIT64(0), INIT64(64), INIT64(128), INIT64(192)
- };
-
- return (count_1bits_8[x & 0xff] +
- count_1bits_8[(x >> 8) & 0xff] +
- count_1bits_8[(x >> 16) & 0xff] +
- count_1bits_8[x >> 24]);
-}
-
-/* Returns the number of 1-bits in 'x', between 0 and 64 inclusive. */
-unsigned int
-count_1bits(uint64_t x)
-{
- return count_1bits_32(x) + count_1bits_32(x >> 32);
-}
+const uint8_t count_1bits_8[256] = {
+ INIT64(0), INIT64(64), INIT64(128), INIT64(192)
+};
+#endif
/* Returns true if the 'n' bytes starting at 'p' are zeros. */
bool
return log_2_floor(n) + !is_pow2(n);
}
-unsigned int count_1bits(uint64_t);
+/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */
+static inline unsigned int
+count_1bits_32(uint32_t x)
+{
+#if __GNUC__ >= 4 && defined(__corei7)
+ /* __builtin_popcount() is fast only when supported by the CPU. */
+ /* NOTE(review): __corei7 is the GCC target macro set by -march=corei7;
+ * presumably the intent is "CPU has POPCNT" -- confirm this gating
+ * matches the build's supported -march values. */
+ return __builtin_popcount(x);
+#else
+ /* The table itself is defined once, non-static, in the .c file; this
+ * declaration lets the inline function reference it from any includer. */
+ extern const uint8_t count_1bits_8[256];
+ /* This portable implementation is the fastest one we know of for 32 bits,
+ * and faster than GCC __builtin_popcount(). */
+ return (count_1bits_8[x & 0xff] +
+ count_1bits_8[(x >> 8) & 0xff] +
+ count_1bits_8[(x >> 16) & 0xff] +
+ count_1bits_8[x >> 24]);
+#endif
+}
+
+/* Returns the number of 1-bits in 'x', between 0 and 64 inclusive. */
+static inline unsigned int
+count_1bits(uint64_t x)
+{
+ /* 'sizeof(void *) == 8' is a compile-time constant, so the untaken
+ * branch below is dead code the compiler can discard. */
+ if (sizeof(void *) == 8) { /* 64-bit CPU */
+#if __GNUC__ >= 4 && defined(__corei7)
+ /* __builtin_popcountll() is fast only when supported by the CPU. */
+ return __builtin_popcountll(x);
+#else
+ /* This portable implementation is the fastest one we know of for 64
+ * bits, and about 3x faster than GCC 4.7 __builtin_popcountll(). */
+ /* SWAR popcount: fold pair, nibble, and byte counts in place, then
+ * sum all eight byte counts via the multiply-by-0x0101... trick. */
+ const uint64_t h55 = UINT64_C(0x5555555555555555);
+ const uint64_t h33 = UINT64_C(0x3333333333333333);
+ const uint64_t h0F = UINT64_C(0x0F0F0F0F0F0F0F0F);
+ const uint64_t h01 = UINT64_C(0x0101010101010101);
+ x -= (x >> 1) & h55; /* Count of each 2 bits in-place. */
+ x = (x & h33) + ((x >> 2) & h33); /* Count of each 4 bits in-place. */
+ x = (x + (x >> 4)) & h0F; /* Count of each 8 bits in-place. */
+ return (x * h01) >> 56; /* Sum of all bytes. */
+#endif
+ } else { /* 32-bit CPU */
+ /* Two 32-bit popcounts; count_1bits_32() picks its own best method. */
+ return count_1bits_32(x) + count_1bits_32(x >> 32);
+ }
+}
/* Returns the rightmost 1-bit in 'x' (e.g. 01011000 => 00001000), or 0 if 'x'
* is 0. */