* THE SOFTWARE.
*/
#include "qemu/osdep.h"
-#include "qemu-common.h"
#include "qemu/cutils.h"
+#include "qemu/bswap.h"
+static bool
+buffer_zero_int(const void *buf, size_t len)
+{
+ if (unlikely(len < 8)) {
+ /* For a very small buffer, simply accumulate all the bytes. */
+ const unsigned char *p = buf;
+ const unsigned char *e = buf + len;
+ unsigned char t = 0;
+
+ do {
+ t |= *p++;
+ } while (p < e);
+
+ return t == 0;
+ } else {
+ /* Otherwise, use the unaligned memory access functions to
+ handle the beginning and end of the buffer, with a couple
+ of loops handling the middle aligned section. */
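+ /* ldq_he_p() (qemu/bswap.h) is an unaligned host-endian 64-bit load. */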
+ uint64_t t = ldq_he_p(buf);
+ const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
+ const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
+
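+ /* The unaligned head above and the tail below may overlap the aligned
+ words scanned here; reading bytes twice is harmless because they
+ are only OR-ed together. */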
+ for (; p + 8 <= e; p += 8) {
+ __builtin_prefetch(p + 8);
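+ /* t still holds the previous block's accumulated value (the
+ unaligned head on the first pass); testing it here lets the
+ check overlap with the current block's loads. */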
+ if (t) {
+ return false;
+ }
+ t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
+ }
+ while (p < e) {
+ t |= *p++;
+ }
+ t |= ldq_he_p(buf + len - 8);
-/* vector definitions */
-#ifdef __ALTIVEC__
-#include <altivec.h>
-/* The altivec.h header says we're allowed to undef these for
- * C++ compatibility. Here we don't care about C++, but we
- * undef them anyway to avoid namespace pollution.
+ return t == 0;
+ }
+}
+
+#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
+/* Do not use push_options pragmas unnecessarily, because clang
+ * does not support them.
*/
-#undef vector
-#undef pixel
-#undef bool
-#define VECTYPE __vector unsigned char
-#define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
-#define VEC_OR(v1, v2) ((v1) | (v2))
-/* altivec.h may redefine the bool macro as vector type.
- * Reset it to POSIX semantics. */
-#define bool _Bool
-#elif defined __SSE2__
-#include <emmintrin.h>
-#define VECTYPE __m128i
-#define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
-#define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
-#elif defined(__aarch64__)
-#include "arm_neon.h"
-#define VECTYPE uint64x2_t
-#define ALL_EQ(v1, v2) \
- ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
- (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))
-#define VEC_OR(v1, v2) ((v1) | (v2))
-#else
-#define VECTYPE unsigned long
-#define ALL_EQ(v1, v2) ((v1) == (v2))
-#define VEC_OR(v1, v2) ((v1) | (v2))
+#ifdef CONFIG_AVX2_OPT
+#pragma GCC push_options
+#pragma GCC target("sse2")
#endif
+#include <emmintrin.h>
-#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
+/* Note that each of these vectorized functions requires len >= 64. */
static bool
-can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+buffer_zero_sse2(const void *buf, size_t len)
{
- return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
- * sizeof(VECTYPE)) == 0
- && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+ __m128i t = _mm_loadu_si128(buf);
+ __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+ __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+ __m128i zero = _mm_setzero_si128();
+
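+ /* The 5 * 16 bias ensures that each 64-byte block p[-4] .. p[-1]
+ is 16-byte aligned and lies entirely within the buffer whenever
+ p <= e; it may overlap the unaligned head, which is harmless
+ because all values are only OR-ed together. */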
+ /* Loop over 16-byte aligned blocks of 64 bytes. */
+ while (likely(p <= e)) {
+ __builtin_prefetch(p);
+ t = _mm_cmpeq_epi8(t, zero);
+ if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
-/*
- * Searches for an area with non-zero content in a buffer
- *
- * Attention! The len must be a multiple of
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
- * and addr must be a multiple of sizeof(VECTYPE) due to
- * restriction of optimizations in this function.
- *
- * can_use_buffer_find_nonzero_offset_inner() can be used to
- * check these requirements.
- *
- * The return value is the offset of the non-zero area rounded
- * down to a multiple of sizeof(VECTYPE) for the first
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
- * afterwards.
- *
- * If the buffer is all zero the return value is equal to len.
- */
+ /* Finish the aligned tail. */
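+ /* At most three aligned vectors remain unprocessed below e;
+ re-reading data already folded into t is harmless. */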
+ t |= e[-3];
+ t |= e[-2];
+ t |= e[-1];
-static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
-{
- const VECTYPE *p = buf;
- const VECTYPE zero = (VECTYPE){0};
- size_t i;
+ /* Finish the unaligned tail. */
+ t |= _mm_loadu_si128(buf + len - 16);
- assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
+ return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+}
+#ifdef CONFIG_AVX2_OPT
+#pragma GCC pop_options
+#endif
- if (!len) {
- return 0;
- }
+#ifdef CONFIG_AVX2_OPT
+/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
+ * the includes have to be within the corresponding push_options region, and
+ * therefore the regions themselves have to be ordered with increasing ISA.
+ */
+#pragma GCC push_options
+#pragma GCC target("sse4")
+#include <smmintrin.h>
- for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
- if (!ALL_EQ(p[i], zero)) {
- return i * sizeof(VECTYPE);
+static bool
+buffer_zero_sse4(const void *buf, size_t len)
+{
+ __m128i t = _mm_loadu_si128(buf);
+ __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+ __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+
+ /* Loop over 16-byte aligned blocks of 64 bytes. */
+ while (likely(p <= e)) {
+ __builtin_prefetch(p);
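+ /* _mm_testz_si128 (SSE4.1 PTEST) tests t == 0 directly, avoiding
+ the separate compare and movemask of the SSE2 version. */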
+ if (unlikely(!_mm_testz_si128(t, t))) {
+ return false;
}
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
}
- for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
- i < len / sizeof(VECTYPE);
- i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
- VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
- VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
- VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
- VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
- VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
- VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
- if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
- break;
- }
- }
+ /* Finish the aligned tail. */
+ t |= e[-3];
+ t |= e[-2];
+ t |= e[-1];
+
+ /* Finish the unaligned tail. */
+ t |= _mm_loadu_si128(buf + len - 16);
- return i * sizeof(VECTYPE);
+ return _mm_testz_si128(t, t);
}
-#if defined CONFIG_AVX2_OPT
+#pragma GCC pop_options
#pragma GCC push_options
#pragma GCC target("avx2")
-#include <cpuid.h>
#include <immintrin.h>
-#define AVX2_VECTYPE __m256i
-#define AVX2_ALL_EQ(v1, v2) \
- (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
-#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
-
static bool
-can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+buffer_zero_avx2(const void *buf, size_t len)
{
- return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
- * sizeof(AVX2_VECTYPE)) == 0
- && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
-}
-
-static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
-{
- const AVX2_VECTYPE *p = buf;
- const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
- size_t i;
-
- assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
-
- if (!len) {
- return 0;
- }
-
- for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
- if (!AVX2_ALL_EQ(p[i], zero)) {
- return i * sizeof(AVX2_VECTYPE);
+ /* Begin with an unaligned head of 32 bytes. */
+ __m256i t = _mm256_loadu_si256(buf);
+ __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
+ __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+
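+ /* As in the SSE2 version, the 5 * 32 bias ensures that each
+ 128-byte block p[-4] .. p[-1] is 32-byte aligned and lies entirely
+ within the buffer whenever p <= e. */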
+ if (likely(p <= e)) {
+ /* Loop over 32-byte aligned blocks of 128 bytes. */
+ do {
+ __builtin_prefetch(p);
+ if (unlikely(!_mm256_testz_si256(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ } while (p <= e);
+ } else {
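+ /* The buffer is too short for even one aligned 128-byte block:
+ fold bytes 32..64 of the head here; the unaligned tail loads
+ below cover the rest (the last two alone suffice when
+ len <= 128). */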
+ t |= _mm256_loadu_si256(buf + 32);
+ if (len <= 128) {
+ goto last2;
}
}
- for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
- i < len / sizeof(AVX2_VECTYPE);
- i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
- AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
- AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
- AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
- AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
- AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
- AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
- if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
- break;
- }
- }
+ /* Finish the last block of 128 bytes unaligned. */
+ t |= _mm256_loadu_si256(buf + len - 4 * 32);
+ t |= _mm256_loadu_si256(buf + len - 3 * 32);
+ last2:
+ t |= _mm256_loadu_si256(buf + len - 2 * 32);
+ t |= _mm256_loadu_si256(buf + len - 1 * 32);
- return i * sizeof(AVX2_VECTYPE);
+ return _mm256_testz_si256(t, t);
}
+#pragma GCC pop_options
+#endif /* CONFIG_AVX2_OPT */
-static bool avx2_support(void)
-{
- int a, b, c, d;
+/* Note that for test_buffer_is_zero_next_accel, the most preferred
+ * ISA must be assigned the least significant bit.
+ */
+#define CACHE_AVX2 1
+#define CACHE_SSE4 2
+#define CACHE_SSE2 4
- if (__get_cpuid_max(0, NULL) < 7) {
- return false;
- }
+/* Make sure that these variables are appropriately initialized when
+ * SSE2 is enabled on the compiler command-line, but the compiler is
+ * too old to support CONFIG_AVX2_OPT.
+ */
+#ifdef CONFIG_AVX2_OPT
+# define INIT_CACHE 0
+# define INIT_ACCEL buffer_zero_int
+#else
+# ifndef __SSE2__
+# error "ISA selection confusion"
+# endif
+# define INIT_CACHE CACHE_SSE2
+# define INIT_ACCEL buffer_zero_sse2
+#endif
- __cpuid_count(7, 0, a, b, c, d);
+static unsigned cpuid_cache = INIT_CACHE;
+static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
- return b & bit_AVX2;
+static void init_accel(unsigned cache)
+{
+ bool (*fn)(const void *, size_t) = buffer_zero_int;
+ if (cache & CACHE_SSE2) {
+ fn = buffer_zero_sse2;
+ }
+#ifdef CONFIG_AVX2_OPT
+ if (cache & CACHE_SSE4) {
+ fn = buffer_zero_sse4;
+ }
+ if (cache & CACHE_AVX2) {
+ fn = buffer_zero_avx2;
+ }
+#endif
+ buffer_accel = fn;
}
-static bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
- __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
-static size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
- __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
+#ifdef CONFIG_AVX2_OPT
+#include "qemu/cpuid.h"
-static void *buffer_find_nonzero_offset_ifunc(void)
+static void __attribute__((constructor)) init_cpuid_cache(void)
{
- typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
- buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+ int max = __get_cpuid_max(0, NULL);
+ int a, b, c, d;
+ unsigned cache = 0;
- return func;
+ if (max >= 1) {
+ __cpuid(1, a, b, c, d);
+ if (d & bit_SSE2) {
+ cache |= CACHE_SSE2;
+ }
+ if (c & bit_SSE4_1) {
+ cache |= CACHE_SSE4;
+ }
+
+ /* We must check that AVX is not just available, but usable. */
+ if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+ int bv;
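+ /* XCR0 bits 1 and 2 (SSE and AVX state) must both be enabled
+ by the OS, hence the (bv & 6) == 6 check below. */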
+ __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+ __cpuid_count(7, 0, a, b, c, d);
+ if ((bv & 6) == 6 && (b & bit_AVX2)) {
+ cache |= CACHE_AVX2;
+ }
+ }
+ }
+ cpuid_cache = cache;
+ init_accel(cache);
}
+#endif /* CONFIG_AVX2_OPT */
-static void *can_use_buffer_find_nonzero_offset_ifunc(void)
+bool test_buffer_is_zero_next_accel(void)
{
- typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
- can_use_buffer_find_nonzero_offset_avx2 :
- can_use_buffer_find_nonzero_offset_inner;
-
- return func;
+ /* If no bits are set, we have just tested buffer_zero_int, and there
+ are no further acceleration options to test. */
+ if (cpuid_cache == 0) {
+ return false;
+ }
+ /* Disable the accelerator we used before and select a new one. */
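+ /* x &= x - 1 clears the least significant set bit, i.e. the
+ accelerator just used, since the most preferred ISA holds the
+ lowest bit. */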
+ cpuid_cache &= cpuid_cache - 1;
+ init_accel(cpuid_cache);
+ return true;
}
-#pragma GCC pop_options
-#else
-static bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+
+static bool select_accel_fn(const void *buf, size_t len)
{
- return can_use_buffer_find_nonzero_offset_inner(buf, len);
+ if (likely(len >= 64)) {
+ return buffer_accel(buf, len);
+ }
+ return buffer_zero_int(buf, len);
}
-static size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+#else
+#define select_accel_fn buffer_zero_int
+bool test_buffer_is_zero_next_accel(void)
{
- return buffer_find_nonzero_offset_inner(buf, len);
+ return false;
}
#endif
/*
* Checks if a buffer is all zeroes
- *
- * Attention! The len must be a multiple of 4 * sizeof(long) due to
- * restriction of optimizations in this function.
*/
bool buffer_is_zero(const void *buf, size_t len)
{
- /*
- * Use long as the biggest available internal data type that fits into the
- * CPU register and unroll the loop to smooth out the effect of memory
- * latency.
- */
-
- size_t i;
- long d0, d1, d2, d3;
- const long * const data = buf;
-
- /* use vector optimized zero check if possible */
- if (can_use_buffer_find_nonzero_offset(buf, len)) {
- return buffer_find_nonzero_offset(buf, len) == len;
+ if (unlikely(len == 0)) {
+ return true;
}
- assert(len % (4 * sizeof(long)) == 0);
- len /= sizeof(long);
-
- for (i = 0; i < len; i += 4) {
- d0 = data[i + 0];
- d1 = data[i + 1];
- d2 = data[i + 2];
- d3 = data[i + 3];
-
- if (d0 || d1 || d2 || d3) {
- return false;
- }
- }
+ /* Fetch the beginning of the buffer while we select the accelerator. */
+ __builtin_prefetch(buf);
- return true;
+ /* Use an optimized zero check if possible. The fallback is an
+ unrolled loop over 64-bit integers. */
+ return select_accel_fn(buf, len);
}
-