[mirror_qemu.git] / util / bufferiszero.c

/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"

static bool
buffer_zero_int(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* For a very small buffer, simply accumulate all the bytes.  */
        const unsigned char *p = buf;
        const unsigned char *e = buf + len;
        unsigned char t = 0;

        do {
            t |= *p++;
        } while (p < e);

        return t == 0;
    } else {
        /* Otherwise, use the unaligned memory access functions to
           handle the beginning and end of the buffer, with a couple
           of loops handling the middle aligned section.  */
        uint64_t t = ldq_he_p(buf);
        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);

        for (; p + 8 <= e; p += 8) {
            __builtin_prefetch(p + 8);
            if (t) {
                return false;
            }
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        }
        while (p < e) {
            t |= *p++;
        }
        t |= ldq_he_p(buf + len - 8);

        return t == 0;
    }
}

#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
/* Do not use push_options pragmas unnecessarily, because clang
 * does not support them.
 */
#ifdef CONFIG_AVX2_OPT
#pragma GCC push_options
#pragma GCC target("sse2")
#endif
#include <emmintrin.h>

/* Note that each of these vectorized functions require len >= 64.  */

static bool
buffer_zero_sse2(const void *buf, size_t len)
{
    __m128i t = _mm_loadu_si128(buf);
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
    __m128i zero = _mm_setzero_si128();

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        t = _mm_cmpeq_epi8(t, zero);
        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
}
#ifdef CONFIG_AVX2_OPT
#pragma GCC pop_options
#endif

#ifdef CONFIG_AVX2_OPT
/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
 * the includes have to be within the corresponding push_options region, and
 * therefore the regions themselves have to be ordered with increasing ISA.
 */
#pragma GCC push_options
#pragma GCC target("sse4")
#include <smmintrin.h>

static bool
buffer_zero_sse4(const void *buf, size_t len)
{
    __m128i t = _mm_loadu_si128(buf);
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        if (unlikely(!_mm_testz_si128(t, t))) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_testz_si128(t, t);
}

#pragma GCC pop_options
#pragma GCC push_options
#pragma GCC target("avx2")
#include <immintrin.h>

static bool
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 32 bytes.  */
    __m256i t = _mm256_loadu_si256(buf);
    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);

    if (likely(p <= e)) {
        /* Loop over 32-byte aligned blocks of 128.  */
        do {
            __builtin_prefetch(p);
            if (unlikely(!_mm256_testz_si256(t, t))) {
                return false;
            }
            t = p[-4] | p[-3] | p[-2] | p[-1];
            p += 4;
        } while (p <= e);
    } else {
        t |= _mm256_loadu_si256(buf + 32);
        if (len <= 128) {
            goto last2;
        }
    }

    /* Finish the last block of 128 unaligned.  */
    t |= _mm256_loadu_si256(buf + len - 4 * 32);
    t |= _mm256_loadu_si256(buf + len - 3 * 32);
 last2:
    t |= _mm256_loadu_si256(buf + len - 2 * 32);
    t |= _mm256_loadu_si256(buf + len - 1 * 32);

    return _mm256_testz_si256(t, t);
}
#pragma GCC pop_options
#endif /* CONFIG_AVX2_OPT */

/* Note that for test_buffer_is_zero_next_accel, the most preferred
 * ISA must have the least significant bit.
 */
#define CACHE_AVX2    1
#define CACHE_SSE4    2
#define CACHE_SSE2    4

/* Make sure that these variables are appropriately initialized when
 * SSE2 is enabled on the compiler command-line, but the compiler is
 * too old to support CONFIG_AVX2_OPT.
 */
#ifdef CONFIG_AVX2_OPT
# define INIT_CACHE 0
# define INIT_ACCEL buffer_zero_int
#else
# ifndef __SSE2__
#  error "ISA selection confusion"
# endif
# define INIT_CACHE CACHE_SSE2
# define INIT_ACCEL buffer_zero_sse2
#endif

static unsigned cpuid_cache = INIT_CACHE;
static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;

static void init_accel(unsigned cache)
{
    bool (*fn)(const void *, size_t) = buffer_zero_int;
    if (cache & CACHE_SSE2) {
        fn = buffer_zero_sse2;
    }
#ifdef CONFIG_AVX2_OPT
    if (cache & CACHE_SSE4) {
        fn = buffer_zero_sse4;
    }
    if (cache & CACHE_AVX2) {
        fn = buffer_zero_avx2;
    }
#endif
    buffer_accel = fn;
}

#ifdef CONFIG_AVX2_OPT
#include "qemu/cpuid.h"

static void __attribute__((constructor)) init_cpuid_cache(void)
{
    int max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    unsigned cache = 0;

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        if (d & bit_SSE2) {
            cache |= CACHE_SSE2;
        }
        if (c & bit_SSE4_1) {
            cache |= CACHE_SSE4;
        }

        /* We must check that AVX is not just available, but usable.  */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            __cpuid_count(7, 0, a, b, c, d);
            if ((bv & 6) == 6 && (b & bit_AVX2)) {
                cache |= CACHE_AVX2;
            }
        }
    }
    cpuid_cache = cache;
    init_accel(cache);
}
#endif /* CONFIG_AVX2_OPT */

bool test_buffer_is_zero_next_accel(void)
{
    /* If no bits set, we just tested buffer_zero_int, and there
       are no more acceleration options to test.  */
    if (cpuid_cache == 0) {
        return false;
    }
    /* Disable the accelerator we used before and select a new one.  */
    cpuid_cache &= cpuid_cache - 1;
    init_accel(cpuid_cache);
    return true;
}

static bool select_accel_fn(const void *buf, size_t len)
{
    if (likely(len >= 64)) {
        return buffer_accel(buf, len);
    }
    return buffer_zero_int(buf, len);
}

#else
#define select_accel_fn  buffer_zero_int
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}
#endif

/*
 * Checks if a buffer is all zeroes
 */
bool buffer_is_zero(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }

    /* Fetch the beginning of the buffer while we select the accelerator.  */
    __builtin_prefetch(buf);

    /* Use an optimized zero check if possible.  Note that this also
       includes a check for an unrolled loop over 64-bit integers.  */
    return select_accel_fn(buf, len);
}
Commit	Line	Data
88ca8e80 RH	1	/*
	2	* Simple C functions to supplement the C library
	3	*
	4	* Copyright (c) 2006 Fabrice Bellard
	5	*
	6	* Permission is hereby granted, free of charge, to any person obtaining a copy
	7	* of this software and associated documentation files (the "Software"), to deal
	8	* in the Software without restriction, including without limitation the rights
	9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	10	* copies of the Software, and to permit persons to whom the Software is
	11	* furnished to do so, subject to the following conditions:
	12	*
	13	* The above copyright notice and this permission notice shall be included in
	14	* all copies or substantial portions of the Software.
	15	*
	16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	22	* THE SOFTWARE.
	23	*/
	24	#include "qemu/osdep.h"
88ca8e80	25	#include "qemu/cutils.h"
5e33a872	26	#include "qemu/bswap.h"
88ca8e80	27
5e33a872 RH	28	static bool
	29	buffer_zero_int(const void *buf, size_t len)
	30	{
	31	if (unlikely(len < 8)) {
	32	/* For a very small buffer, simply accumulate all the bytes. */
	33	const unsigned char *p = buf;
	34	const unsigned char *e = buf + len;
	35	unsigned char t = 0;
	36
	37	do {
	38	t \|= *p++;
	39	} while (p < e);
	40
	41	return t == 0;
	42	} else {
	43	/* Otherwise, use the unaligned memory access functions to
	44	handle the beginning and end of the buffer, with a couple
	45	of loops handling the middle aligned section. */
	46	uint64_t t = ldq_he_p(buf);
	47	const uint64_t p = (uint64_t )(((uintptr_t)buf + 8) & -8);
	48	const uint64_t e = (uint64_t )(((uintptr_t)buf + len) & -8);
	49
	50	for (; p + 8 <= e; p += 8) {
	51	__builtin_prefetch(p + 8);
	52	if (t) {
	53	return false;
	54	}
	55	t = p[0] \| p[1] \| p[2] \| p[3] \| p[4] \| p[5] \| p[6] \| p[7];
	56	}
	57	while (p < e) {
	58	t \|= *p++;
	59	}
	60	t \|= ldq_he_p(buf + len - 8);
	61
	62	return t == 0;
	63	}
	64	}
	65
d9911d14	66	#if defined(CONFIG_AVX2_OPT) \|\| defined(__SSE2__)
5e33a872 RH	67	/* Do not use push_options pragmas unnecessarily, because clang
	68	* does not support them.
	69	*/
d9911d14	70	#ifdef CONFIG_AVX2_OPT
5e33a872 RH	71	#pragma GCC push_options
	72	#pragma GCC target("sse2")
	73	#endif
	74	#include <emmintrin.h>
d9911d14 RH	75
	76	/* Note that each of these vectorized functions require len >= 64. */
	77
	78	static bool
	79	buffer_zero_sse2(const void *buf, size_t len)
	80	{
	81	__m128i t = _mm_loadu_si128(buf);
	82	__m128i p = (__m128i )(((uintptr_t)buf + 5 * 16) & -16);
	83	__m128i e = (__m128i )(((uintptr_t)buf + len) & -16);
	84	__m128i zero = _mm_setzero_si128();
	85
	86	/* Loop over 16-byte aligned blocks of 64. */
	87	while (likely(p <= e)) {
	88	__builtin_prefetch(p);
	89	t = _mm_cmpeq_epi8(t, zero);
	90	if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
	91	return false;
	92	}
	93	t = p[-4] \| p[-3] \| p[-2] \| p[-1];
	94	p += 4;
	95	}
	96
	97	/* Finish the aligned tail. */
	98	t \|= e[-3];
	99	t \|= e[-2];
	100	t \|= e[-1];
	101
	102	/* Finish the unaligned tail. */
	103	t \|= _mm_loadu_si128(buf + len - 16);
	104
	105	return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
	106	}
	107	#ifdef CONFIG_AVX2_OPT
5e33a872 RH	108	#pragma GCC pop_options
5e33a872 RH	109	#endif
88ca8e80	110
5e33a872	111	#ifdef CONFIG_AVX2_OPT
d9911d14 RH	112	/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
	113	* the includes have to be within the corresponding push_options region, and
	114	* therefore the regions themselves have to be ordered with increasing ISA.
	115	*/
86444f08 PB	116	#pragma GCC push_options
	117	#pragma GCC target("sse4")
	118	#include <smmintrin.h>
86444f08	119
d9911d14 RH	120	static bool
	121	buffer_zero_sse4(const void *buf, size_t len)
	122	{
	123	__m128i t = _mm_loadu_si128(buf);
	124	__m128i p = (__m128i )(((uintptr_t)buf + 5 * 16) & -16);
	125	__m128i e = (__m128i )(((uintptr_t)buf + len) & -16);
	126
	127	/* Loop over 16-byte aligned blocks of 64. */
	128	while (likely(p <= e)) {
	129	__builtin_prefetch(p);
	130	if (unlikely(!_mm_testz_si128(t, t))) {
	131	return false;
	132	}
	133	t = p[-4] \| p[-3] \| p[-2] \| p[-1];
	134	p += 4;
	135	}
	136
	137	/* Finish the aligned tail. */
	138	t \|= e[-3];
	139	t \|= e[-2];
	140	t \|= e[-1];
	141
	142	/* Finish the unaligned tail. */
	143	t \|= _mm_loadu_si128(buf + len - 16);
	144
	145	return _mm_testz_si128(t, t);
	146	}
	147
	148	#pragma GCC pop_options
88ca8e80 RH	149	#pragma GCC push_options
88ca8e80 RH	150	#pragma GCC target("avx2")
88ca8e80	151	#include <immintrin.h>
d9911d14 RH	152
	153	static bool
	154	buffer_zero_avx2(const void *buf, size_t len)
	155	{
	156	/* Begin with an unaligned head of 32 bytes. */
	157	__m256i t = _mm256_loadu_si256(buf);
	158	__m256i p = (__m256i )(((uintptr_t)buf + 5 * 32) & -32);
	159	__m256i e = (__m256i )(((uintptr_t)buf + len) & -32);
	160
	161	if (likely(p <= e)) {
	162	/* Loop over 32-byte aligned blocks of 128. */
	163	do {
	164	__builtin_prefetch(p);
	165	if (unlikely(!_mm256_testz_si256(t, t))) {
	166	return false;
	167	}
	168	t = p[-4] \| p[-3] \| p[-2] \| p[-1];
	169	p += 4;
	170	} while (p <= e);
	171	} else {
	172	t \|= _mm256_loadu_si256(buf + 32);
	173	if (len <= 128) {
	174	goto last2;
	175	}
	176	}
	177
	178	/* Finish the last block of 128 unaligned. */
	179	t \|= _mm256_loadu_si256(buf + len - 4 * 32);
	180	t \|= _mm256_loadu_si256(buf + len - 3 * 32);
	181	last2:
	182	t \|= _mm256_loadu_si256(buf + len - 2 * 32);
	183	t \|= _mm256_loadu_si256(buf + len - 1 * 32);
	184
	185	return _mm256_testz_si256(t, t);
	186	}
5e33a872	187	#pragma GCC pop_options
d9911d14 RH	188	#endif /* CONFIG_AVX2_OPT */
	189
	190	/* Note that for test_buffer_is_zero_next_accel, the most preferred
	191	* ISA must have the least significant bit.
	192	*/
	193	#define CACHE_AVX2 1
	194	#define CACHE_SSE4 2
	195	#define CACHE_SSE2 4
	196
	197	/* Make sure that these variables are appropriately initialized when
	198	* SSE2 is enabled on the compiler command-line, but the compiler is
5dd89908	199	* too old to support CONFIG_AVX2_OPT.
d9911d14 RH	200	*/
	201	#ifdef CONFIG_AVX2_OPT
	202	# define INIT_CACHE 0
	203	# define INIT_ACCEL buffer_zero_int
	204	#else
	205	# ifndef __SSE2__
	206	# error "ISA selection confusion"
	207	# endif
	208	# define INIT_CACHE CACHE_SSE2
	209	# define INIT_ACCEL buffer_zero_sse2
5e33a872	210	#endif
88ca8e80	211
d9911d14 RH	212	static unsigned cpuid_cache = INIT_CACHE;
d9911d14 RH	213	static bool (buffer_accel)(const void , size_t) = INIT_ACCEL;
88ca8e80	214
d9911d14 RH	215	static void init_accel(unsigned cache)
	216	{
	217	bool (fn)(const void , size_t) = buffer_zero_int;
	218	if (cache & CACHE_SSE2) {
	219	fn = buffer_zero_sse2;
	220	}
	221	#ifdef CONFIG_AVX2_OPT
	222	if (cache & CACHE_SSE4) {
	223	fn = buffer_zero_sse4;
	224	}
	225	if (cache & CACHE_AVX2) {
	226	fn = buffer_zero_avx2;
	227	}
	228	#endif
	229	buffer_accel = fn;
	230	}
88ca8e80	231
d9911d14	232	#ifdef CONFIG_AVX2_OPT
5dd89908 RH	233	#include "qemu/cpuid.h"
5dd89908 RH	234
5e33a872	235	static void __attribute__((constructor)) init_cpuid_cache(void)
88ca8e80	236	{
5e33a872 RH	237	int max = __get_cpuid_max(0, NULL);
	238	int a, b, c, d;
	239	unsigned cache = 0;
88ca8e80	240
5e33a872 RH	241	if (max >= 1) {
	242	__cpuid(1, a, b, c, d);
	243	if (d & bit_SSE2) {
	244	cache \|= CACHE_SSE2;
	245	}
5e33a872 RH	246	if (c & bit_SSE4_1) {
5e33a872 RH	247	cache \|= CACHE_SSE4;
88ca8e80	248	}
88ca8e80	249
5e33a872	250	/* We must check that AVX is not just available, but usable. */
d9911d14 RH	251	if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
	252	int bv;
	253	__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
	254	__cpuid_count(7, 0, a, b, c, d);
	255	if ((bv & 6) == 6 && (b & bit_AVX2)) {
	256	cache \|= CACHE_AVX2;
5e33a872	257	}
88ca8e80 RH	258	}
88ca8e80 RH	259	}
5e33a872	260	cpuid_cache = cache;
d9911d14	261	init_accel(cache);
88ca8e80	262	}
d9911d14	263	#endif /* CONFIG_AVX2_OPT */
88ca8e80	264
efad6682 RH	265	bool test_buffer_is_zero_next_accel(void)
	266	{
	267	/* If no bits set, we just tested buffer_zero_int, and there
	268	are no more acceleration options to test. */
	269	if (cpuid_cache == 0) {
	270	return false;
	271	}
	272	/* Disable the accelerator we used before and select a new one. */
	273	cpuid_cache &= cpuid_cache - 1;
d9911d14	274	init_accel(cpuid_cache);
efad6682 RH	275	return true;
	276	}
	277
5e33a872	278	static bool select_accel_fn(const void *buf, size_t len)
88ca8e80	279	{
d9911d14 RH	280	if (likely(len >= 64)) {
d9911d14 RH	281	return buffer_accel(buf, len);
5e33a872 RH	282	}
5e33a872 RH	283	return buffer_zero_int(buf, len);
88ca8e80 RH	284	}
88ca8e80 RH	285
5e33a872 RH	286	#else
5e33a872 RH	287	#define select_accel_fn buffer_zero_int
efad6682 RH	288	bool test_buffer_is_zero_next_accel(void)
	289	{
	290	return false;
	291	}
	292	#endif
	293
88ca8e80 RH	294	/*
88ca8e80 RH	295	* Checks if a buffer is all zeroes
88ca8e80 RH	296	*/
	297	bool buffer_is_zero(const void *buf, size_t len)
	298	{
5e33a872 RH	299	if (unlikely(len == 0)) {
5e33a872 RH	300	return true;
88ca8e80 RH	301	}
88ca8e80 RH	302
083d012a RH	303	/* Fetch the beginning of the buffer while we select the accelerator. */
	304	__builtin_prefetch(buf);
	305
5e33a872 RH	306	/* Use an optimized zero check if possible. Note that this also
	307	includes a check for an unrolled loop over 64-bit integers. */
	308	return select_accel_fn(buf, len);
88ca8e80	309	}