[mirror_qemu.git] / util / bufferiszero.c

/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"

/*
 * Portable fallback: report whether the @len bytes starting at @buf are
 * all zero, using only byte and 64-bit integer operations.
 *
 * @len must be non-zero: the short-buffer path below always reads at
 * least one byte before testing its bound.
 *
 * Returns true if every byte in the buffer is zero.
 */
static bool
buffer_zero_int(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* Too short for a single 64-bit load: OR the bytes together.  */
        const unsigned char *cur = buf;
        const unsigned char *stop = cur + len;
        unsigned char acc = 0;

        do {
            acc |= *cur;
            cur++;
        } while (cur < stop);

        return acc == 0;
    } else {
        /*
         * Unaligned 8-byte loads cover the head and the tail of the
         * buffer; everything in between is read through 8-byte aligned
         * pointers.  The regions may overlap, which is harmless because
         * the values are only ever OR-ed together.
         */
        uint64_t acc = ldq_he_p(buf);
        const uint64_t *word = (const uint64_t *)(((uintptr_t)buf + 8) & -8);
        const uint64_t *last = (const uint64_t *)(((uintptr_t)buf + len) & -8);

        /* Main loop: eight aligned words (64 bytes) per iteration.  */
        while (word + 8 <= last) {
            __builtin_prefetch(word + 8);
            /* Test what was gathered so far before loading more.  */
            if (acc != 0) {
                return false;
            }
            acc = word[0] | word[1] | word[2] | word[3]
                | word[4] | word[5] | word[6] | word[7];
            word += 8;
        }

        /* Fewer than eight aligned words remain.  */
        for (; word < last; word++) {
            acc |= *word;
        }

        /* Final, possibly unaligned, eight bytes of the buffer.  */
        acc |= ldq_he_p((const unsigned char *)buf + len - 8);

        return acc == 0;
    }
}

#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
/* Do not use push_options pragmas unnecessarily, because clang
 * does not support them.
 */
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
#pragma GCC push_options
#pragma GCC target("sse2")
#endif
#include <emmintrin.h>

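/* Note that each of these vectorized functions requires a minimum
 * length: 64 bytes for SSE2 and SSE4, 128 for AVX2 and 256 for AVX512F
 * (see the length_to_accel values chosen in init_accel).
 */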

/*
 * Report whether the @len bytes starting at @buf are all zero, using
 * 16-byte SSE2 vectors.
 *
 * @len must be at least 64: the tail handling below reads the three
 * aligned vectors preceding the end of the buffer plus its final 16
 * bytes without any further bounds check.
 *
 * Returns true if every byte in the buffer is zero.
 */
static bool
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned head: the first 16 bytes of the buffer.  */
    __m128i t = _mm_loadu_si128(buf);
    /* p stays four vectors ahead of the data it reads (p[-4]..p[-1]);
       its initial value makes p[-4] the first 16-byte aligned vector
       after the start of the buffer.  */
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    /* e is the end of the buffer rounded down to a 16-byte boundary.  */
    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
    __m128i zero = _mm_setzero_si128();

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        /* Test the data gathered by the previous step: each byte of t
           equal to zero sets one bit of the 16-bit mask, so anything
           other than 0xFFFF means a non-zero byte was seen.  */
        t = _mm_cmpeq_epi8(t, zero);
        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
            return false;
        }
        /* Gather the next four aligned vectors; they are tested on the
           next iteration or by the final check below.  */
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  These loads may revisit data already
       examined by the loop, which is harmless since it is only OR-ed.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
}
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
#pragma GCC pop_options
#endif

#ifdef CONFIG_AVX2_OPT
/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
 * the includes have to be within the corresponding push_options region, and
 * therefore the regions themselves have to be ordered with increasing ISA.
 */
#pragma GCC push_options
#pragma GCC target("sse4")
#include <smmintrin.h>

/*
 * Report whether the @len bytes starting at @buf are all zero, using
 * 16-byte vectors and the SSE4.1 test instruction (_mm_testz_si128)
 * in place of the compare/movemask pair used by the SSE2 version.
 *
 * @len must be at least 64: the tail handling below reads the three
 * aligned vectors preceding the end of the buffer plus its final 16
 * bytes without any further bounds check.
 *
 * Returns true if every byte in the buffer is zero.
 */
static bool
buffer_zero_sse4(const void *buf, size_t len)
{
    /* Unaligned head: the first 16 bytes of the buffer.  */
    __m128i t = _mm_loadu_si128(buf);
    /* p stays four vectors ahead of the data it reads (p[-4]..p[-1]);
       its initial value makes p[-4] the first 16-byte aligned vector
       after the start of the buffer.  */
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    /* e is the end of the buffer rounded down to a 16-byte boundary.  */
    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        /* _mm_testz_si128(t, t) is non-zero only when t is all zero, so
           this bails out as soon as previously gathered data has a bit
           set.  */
        if (unlikely(!_mm_testz_si128(t, t))) {
            return false;
        }
        /* Gather the next four aligned vectors; they are tested on the
           next iteration or by the final check below.  */
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  These loads may revisit data already
       examined by the loop, which is harmless since it is only OR-ed.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_testz_si128(t, t);
}

#pragma GCC pop_options
#pragma GCC push_options
#pragma GCC target("avx2")
#include <immintrin.h>

/*
 * Report whether the @len bytes starting at @buf are all zero, using
 * 32-byte AVX2 vectors.
 *
 * @len must be at least 128: the tail handling below unconditionally
 * loads the last 4 * 32 bytes of the buffer.
 *
 * Returns true if every byte in the buffer is zero.
 */
static bool
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 32 bytes.  */
    __m256i t = _mm256_loadu_si256(buf);
    /* p stays four vectors ahead of the data it reads (p[-4]..p[-1]);
       its initial value makes p[-4] the first 32-byte aligned vector
       after the start of the buffer.  */
    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    /* e is the end of the buffer rounded down to a 32-byte boundary.  */
    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);

    /* Loop over 32-byte aligned blocks of 128.  */
    while (p <= e) {
        __builtin_prefetch(p);
        /* _mm256_testz_si256(t, t) is non-zero only when t is all zero,
           so this bails out as soon as previously gathered data has a
           bit set.  */
        if (unlikely(!_mm256_testz_si256(t, t))) {
            return false;
        }
        /* Gather the next four aligned vectors; they are tested on the
           next iteration or by the final check below.  */
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    } ;

    /* Finish the last block of 128 unaligned.  These loads may overlap
       data already examined, which is harmless since it is only OR-ed.  */
    t |= _mm256_loadu_si256(buf + len - 4 * 32);
    t |= _mm256_loadu_si256(buf + len - 3 * 32);
    t |= _mm256_loadu_si256(buf + len - 2 * 32);
    t |= _mm256_loadu_si256(buf + len - 1 * 32);

    return _mm256_testz_si256(t, t);
}
#pragma GCC pop_options
#endif /* CONFIG_AVX2_OPT */

#ifdef CONFIG_AVX512F_OPT
#pragma GCC push_options
#pragma GCC target("avx512f")
#include <immintrin.h>

/*
 * Report whether the @len bytes starting at @buf are all zero, using
 * 64-byte AVX512F vectors.
 *
 * @len must be at least 256: the tail handling below unconditionally
 * loads the last 4 * 64 bytes of the buffer.
 *
 * Returns true if every byte in the buffer is zero.
 */
static bool
buffer_zero_avx512(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 64 bytes.  */
    __m512i t = _mm512_loadu_si512(buf);
    /* p stays four vectors ahead of the data it reads (p[-4]..p[-1]);
       its initial value makes p[-4] the first 64-byte aligned vector
       after the start of the buffer.  */
    __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
    /* e is the end of the buffer rounded down to a 64-byte boundary.  */
    __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);

    /* Loop over 64-byte aligned blocks of 256.  */
    while (p <= e) {
        __builtin_prefetch(p);
        /* The mask has one bit per 64-bit lane of t that is non-zero, so
           any set bit means previously gathered data was not all zero.  */
        if (unlikely(_mm512_test_epi64_mask(t, t))) {
            return false;
        }
        /* Gather the next four aligned vectors; they are tested on the
           next iteration or by the final check below.  */
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the last block of 256 unaligned.  These loads may overlap
       data already examined, which is harmless since it is only OR-ed.  */
    t |= _mm512_loadu_si512(buf + len - 4 * 64);
    t |= _mm512_loadu_si512(buf + len - 3 * 64);
    t |= _mm512_loadu_si512(buf + len - 2 * 64);
    t |= _mm512_loadu_si512(buf + len - 1 * 64);

    return !_mm512_test_epi64_mask(t, t);

}
#pragma GCC pop_options
#endif


/* Note that for test_buffer_is_zero_next_accel, the most preferred
 * ISA must have the least significant bit.
 */
#define CACHE_AVX512F 1
#define CACHE_AVX2    2
#define CACHE_SSE4    4
#define CACHE_SSE2    8

/* Make sure that these variables are appropriately initialized when
 * SSE2 is enabled on the compiler command-line, but the compiler is
 * too old to support CONFIG_AVX2_OPT or CONFIG_AVX512F_OPT.
 */
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
# define INIT_CACHE 0
# define INIT_ACCEL buffer_zero_int
#else
# ifndef __SSE2__
#  error "ISA selection confusion"
# endif
# define INIT_CACHE CACHE_SSE2
# define INIT_ACCEL buffer_zero_sse2
#endif

static unsigned cpuid_cache = INIT_CACHE;
static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
static int length_to_accel = 64;

/*
 * Select the zero-check implementation for the ISA bits set in @cache
 * (a mask of CACHE_* values) and publish it in buffer_accel, together
 * with the minimum buffer length it needs in length_to_accel.
 *
 * The candidates are examined from least to most capable, so the last
 * match wins.  Only implementations that were compiled in are
 * considered.  If no bit matches, buffer_zero_int is selected and
 * length_to_accel keeps its previous value.
 */
static void init_accel(unsigned cache)
{
    bool (*chosen)(const void *, size_t) = buffer_zero_int;
    int threshold = length_to_accel;

    if ((cache & CACHE_SSE2) != 0) {
        chosen = buffer_zero_sse2;
        threshold = 64;
    }
#ifdef CONFIG_AVX2_OPT
    if ((cache & CACHE_SSE4) != 0) {
        chosen = buffer_zero_sse4;
        threshold = 64;
    }
    if ((cache & CACHE_AVX2) != 0) {
        chosen = buffer_zero_avx2;
        threshold = 128;
    }
#endif
#ifdef CONFIG_AVX512F_OPT
    if ((cache & CACHE_AVX512F) != 0) {
        chosen = buffer_zero_avx512;
        threshold = 256;
    }
#endif

    length_to_accel = threshold;
    buffer_accel = chosen;
}

#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
#include "qemu/cpuid.h"

/*
 * Probe the host CPU and record which vector ISAs are usable as CACHE_*
 * bits in cpuid_cache, then let init_accel pick the matching
 * implementation.  Declared as a constructor so that it runs
 * automatically before the accelerated path is first used.
 *
 * Bits are recorded for every ISA the CPU reports, whether or not the
 * corresponding implementation was compiled in; init_accel ignores the
 * ones it has no code for.
 */
static void __attribute__((constructor)) init_cpuid_cache(void)
{
    /* Highest basic CPUID leaf supported; 0 if CPUID is unavailable.  */
    int max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    unsigned cache = 0;

    if (max >= 1) {
        /* Leaf 1: basic feature flags in ecx (c) and edx (d).  */
        __cpuid(1, a, b, c, d);
        if (d & bit_SSE2) {
            cache |= CACHE_SSE2;
        }
        if (c & bit_SSE4_1) {
            cache |= CACHE_SSE4;
        }

        /* We must check that AVX is not just available, but usable.  */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            /* Read XCR0 (selected by ecx = 0).  Only the low half in bv
               is used; d is overwritten by the CPUID call below.  */
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            /* Leaf 7, subleaf 0: extended feature flags in ebx (b).  */
            __cpuid_count(7, 0, a, b, c, d);
            /* 0x6: XCR0[2:1] = 11b (XMM and YMM state enabled by OS).  */
            if ((bv & 0x6) == 0x6 && (b & bit_AVX2)) {
                cache |= CACHE_AVX2;
            }
            /* 0xe6:
             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
             *                    and ZMM16-ZMM31 state are enabled by OS)
             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
             */
            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512F)) {
                cache |= CACHE_AVX512F;
            }
        }
    }
    cpuid_cache = cache;
    init_accel(cache);
}
#endif /* CONFIG_AVX512F_OPT || CONFIG_AVX2_OPT */

/*
 * Test hook: step down to the next accelerator so that each one can be
 * exercised in turn.
 *
 * Returns true if a new selection was made, or false once cpuid_cache
 * has no bits left, i.e. buffer_zero_int was the last one tested.
 */
bool test_buffer_is_zero_next_accel(void)
{
    if (cpuid_cache != 0) {
        /*
         * x & (x - 1) clears the least significant set bit, which by
         * the CACHE_* numbering is the most preferred ISA still enabled.
         */
        unsigned remaining = cpuid_cache & (cpuid_cache - 1);

        cpuid_cache = remaining;
        init_accel(remaining);
        return true;
    }

    /* Nothing left to disable: every option has been tested.  */
    return false;
}

/*
 * Dispatch a zero check of @len bytes at @buf: buffers of at least
 * length_to_accel bytes go to the selected accelerator, shorter ones to
 * the integer fallback.  length_to_accel is an int and is converted to
 * size_t for the comparison.
 */
static bool select_accel_fn(const void *buf, size_t len)
{
    return likely(len >= length_to_accel)
           ? buffer_accel(buf, len)
           : buffer_zero_int(buf, len);
}

#else
#define select_accel_fn  buffer_zero_int
/*
 * Variant used when no vectorized implementation is compiled in:
 * buffer_zero_int is the only option, so there is never another
 * accelerator to switch to.
 */
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}
#endif

/*
 * Check whether a buffer is all zeroes.
 *
 * @buf: start of the buffer
 * @len: number of bytes to examine
 *
 * Returns true if all @len bytes are zero; an empty buffer (@len == 0)
 * is reported as zero without @buf being accessed.
 */
bool buffer_is_zero(const void *buf, size_t len)
{
    bool all_zero;

    if (unlikely(len == 0)) {
        /* Nothing to look at, hence nothing non-zero.  */
        all_zero = true;
    } else {
        /* Start pulling in the head of the buffer while the
           implementation is being selected.  */
        __builtin_prefetch(buf);

        /* Use an optimized zero check if possible; the dispatcher falls
           back to an unrolled loop over 64-bit integers otherwise.  */
        all_zero = select_accel_fn(buf, len);
    }

    return all_zero;
}
Commit	Line	Data
88ca8e80 RH	1	/*
	2	* Simple C functions to supplement the C library
	3	*
	4	* Copyright (c) 2006 Fabrice Bellard
	5	*
	6	* Permission is hereby granted, free of charge, to any person obtaining a copy
	7	* of this software and associated documentation files (the "Software"), to deal
	8	* in the Software without restriction, including without limitation the rights
	9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	10	* copies of the Software, and to permit persons to whom the Software is
	11	* furnished to do so, subject to the following conditions:
	12	*
	13	* The above copyright notice and this permission notice shall be included in
	14	* all copies or substantial portions of the Software.
	15	*
	16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	22	* THE SOFTWARE.
	23	*/
	24	#include "qemu/osdep.h"
88ca8e80	25	#include "qemu/cutils.h"
5e33a872	26	#include "qemu/bswap.h"
88ca8e80	27
5e33a872 RH	28	static bool
	29	buffer_zero_int(const void *buf, size_t len)
	30	{
	31	if (unlikely(len < 8)) {
	32	/* For a very small buffer, simply accumulate all the bytes. */
	33	const unsigned char *p = buf;
	34	const unsigned char *e = buf + len;
	35	unsigned char t = 0;
	36
	37	do {
	38	t \|= *p++;
	39	} while (p < e);
	40
	41	return t == 0;
	42	} else {
	43	/* Otherwise, use the unaligned memory access functions to
	44	handle the beginning and end of the buffer, with a couple
	45	of loops handling the middle aligned section. */
	46	uint64_t t = ldq_he_p(buf);
	47	const uint64_t p = (uint64_t )(((uintptr_t)buf + 8) & -8);
	48	const uint64_t e = (uint64_t )(((uintptr_t)buf + len) & -8);
	49
	50	for (; p + 8 <= e; p += 8) {
	51	__builtin_prefetch(p + 8);
	52	if (t) {
	53	return false;
	54	}
	55	t = p[0] \| p[1] \| p[2] \| p[3] \| p[4] \| p[5] \| p[6] \| p[7];
	56	}
	57	while (p < e) {
	58	t \|= *p++;
	59	}
	60	t \|= ldq_he_p(buf + len - 8);
	61
	62	return t == 0;
	63	}
	64	}
	65
27f08ea1	66	#if defined(CONFIG_AVX512F_OPT) \|\| defined(CONFIG_AVX2_OPT) \|\| defined(__SSE2__)
5e33a872 RH	67	/* Do not use push_options pragmas unnecessarily, because clang
	68	* does not support them.
	69	*/
27f08ea1	70	#if defined(CONFIG_AVX512F_OPT) \|\| defined(CONFIG_AVX2_OPT)
5e33a872 RH	71	#pragma GCC push_options
	72	#pragma GCC target("sse2")
	73	#endif
	74	#include <emmintrin.h>
d9911d14 RH	75
	76	/* Note that each of these vectorized functions require len >= 64. */
	77
	78	static bool
	79	buffer_zero_sse2(const void *buf, size_t len)
	80	{
	81	__m128i t = _mm_loadu_si128(buf);
	82	__m128i p = (__m128i )(((uintptr_t)buf + 5 * 16) & -16);
	83	__m128i e = (__m128i )(((uintptr_t)buf + len) & -16);
	84	__m128i zero = _mm_setzero_si128();
	85
	86	/* Loop over 16-byte aligned blocks of 64. */
	87	while (likely(p <= e)) {
	88	__builtin_prefetch(p);
	89	t = _mm_cmpeq_epi8(t, zero);
	90	if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
	91	return false;
	92	}
	93	t = p[-4] \| p[-3] \| p[-2] \| p[-1];
	94	p += 4;
	95	}
	96
	97	/* Finish the aligned tail. */
	98	t \|= e[-3];
	99	t \|= e[-2];
	100	t \|= e[-1];
	101
	102	/* Finish the unaligned tail. */
	103	t \|= _mm_loadu_si128(buf + len - 16);
	104
	105	return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
	106	}
27f08ea1	107	#if defined(CONFIG_AVX512F_OPT) \|\| defined(CONFIG_AVX2_OPT)
5e33a872 RH	108	#pragma GCC pop_options
5e33a872 RH	109	#endif
88ca8e80	110
5e33a872	111	#ifdef CONFIG_AVX2_OPT
d9911d14 RH	112	/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
	113	* the includes have to be within the corresponding push_options region, and
	114	* therefore the regions themselves have to be ordered with increasing ISA.
	115	*/
86444f08 PB	116	#pragma GCC push_options
	117	#pragma GCC target("sse4")
	118	#include <smmintrin.h>
86444f08	119
d9911d14 RH	120	static bool
	121	buffer_zero_sse4(const void *buf, size_t len)
	122	{
	123	__m128i t = _mm_loadu_si128(buf);
	124	__m128i p = (__m128i )(((uintptr_t)buf + 5 * 16) & -16);
	125	__m128i e = (__m128i )(((uintptr_t)buf + len) & -16);
	126
	127	/* Loop over 16-byte aligned blocks of 64. */
	128	while (likely(p <= e)) {
	129	__builtin_prefetch(p);
	130	if (unlikely(!_mm_testz_si128(t, t))) {
	131	return false;
	132	}
	133	t = p[-4] \| p[-3] \| p[-2] \| p[-1];
	134	p += 4;
	135	}
	136
	137	/* Finish the aligned tail. */
	138	t \|= e[-3];
	139	t \|= e[-2];
	140	t \|= e[-1];
	141
	142	/* Finish the unaligned tail. */
	143	t \|= _mm_loadu_si128(buf + len - 16);
	144
	145	return _mm_testz_si128(t, t);
	146	}
	147
	148	#pragma GCC pop_options
88ca8e80 RH	149	#pragma GCC push_options
88ca8e80 RH	150	#pragma GCC target("avx2")
88ca8e80	151	#include <immintrin.h>
d9911d14 RH	152
	153	static bool
	154	buffer_zero_avx2(const void *buf, size_t len)
	155	{
	156	/* Begin with an unaligned head of 32 bytes. */
	157	__m256i t = _mm256_loadu_si256(buf);
	158	__m256i p = (__m256i )(((uintptr_t)buf + 5 * 32) & -32);
	159	__m256i e = (__m256i )(((uintptr_t)buf + len) & -32);
	160
8f13a39d RH	161	/* Loop over 32-byte aligned blocks of 128. */
	162	while (p <= e) {
	163	__builtin_prefetch(p);
	164	if (unlikely(!_mm256_testz_si256(t, t))) {
	165	return false;
d9911d14	166	}
8f13a39d RH	167	t = p[-4] \| p[-3] \| p[-2] \| p[-1];
	168	p += 4;
	169	} ;
d9911d14 RH	170
	171	/* Finish the last block of 128 unaligned. */
	172	t \|= _mm256_loadu_si256(buf + len - 4 * 32);
	173	t \|= _mm256_loadu_si256(buf + len - 3 * 32);
d9911d14 RH	174	t \|= _mm256_loadu_si256(buf + len - 2 * 32);
	175	t \|= _mm256_loadu_si256(buf + len - 1 * 32);
	176
	177	return _mm256_testz_si256(t, t);
	178	}
5e33a872	179	#pragma GCC pop_options
d9911d14 RH	180	#endif /* CONFIG_AVX2_OPT */
d9911d14 RH	181
27f08ea1 RH	182	#ifdef CONFIG_AVX512F_OPT
	183	#pragma GCC push_options
	184	#pragma GCC target("avx512f")
	185	#include <immintrin.h>
	186
	187	static bool
	188	buffer_zero_avx512(const void *buf, size_t len)
	189	{
	190	/* Begin with an unaligned head of 64 bytes. */
	191	__m512i t = _mm512_loadu_si512(buf);
	192	__m512i p = (__m512i )(((uintptr_t)buf + 5 * 64) & -64);
	193	__m512i e = (__m512i )(((uintptr_t)buf + len) & -64);
	194
	195	/* Loop over 64-byte aligned blocks of 256. */
	196	while (p <= e) {
	197	__builtin_prefetch(p);
	198	if (unlikely(_mm512_test_epi64_mask(t, t))) {
	199	return false;
	200	}
	201	t = p[-4] \| p[-3] \| p[-2] \| p[-1];
	202	p += 4;
	203	}
	204
	205	t \|= _mm512_loadu_si512(buf + len - 4 * 64);
	206	t \|= _mm512_loadu_si512(buf + len - 3 * 64);
	207	t \|= _mm512_loadu_si512(buf + len - 2 * 64);
	208	t \|= _mm512_loadu_si512(buf + len - 1 * 64);
	209
	210	return !_mm512_test_epi64_mask(t, t);
	211
	212	}
	213	#pragma GCC pop_options
	214	#endif
	215
	216
d9911d14 RH	217	/* Note that for test_buffer_is_zero_next_accel, the most preferred
	218	* ISA must have the least significant bit.
	219	*/
27f08ea1 RH	220	#define CACHE_AVX512F 1
	221	#define CACHE_AVX2 2
	222	#define CACHE_SSE4 4
	223	#define CACHE_SSE2 8
d9911d14 RH	224
	225	/* Make sure that these variables are appropriately initialized when
	226	* SSE2 is enabled on the compiler command-line, but the compiler is
5dd89908	227	* too old to support CONFIG_AVX2_OPT.
d9911d14	228	*/
27f08ea1	229	#if defined(CONFIG_AVX512F_OPT) \|\| defined(CONFIG_AVX2_OPT)
d9911d14 RH	230	# define INIT_CACHE 0
	231	# define INIT_ACCEL buffer_zero_int
	232	#else
	233	# ifndef __SSE2__
	234	# error "ISA selection confusion"
	235	# endif
	236	# define INIT_CACHE CACHE_SSE2
	237	# define INIT_ACCEL buffer_zero_sse2
5e33a872	238	#endif
88ca8e80	239
d9911d14 RH	240	static unsigned cpuid_cache = INIT_CACHE;
d9911d14 RH	241	static bool (buffer_accel)(const void , size_t) = INIT_ACCEL;
27f08ea1	242	static int length_to_accel = 64;
88ca8e80	243
d9911d14 RH	244	static void init_accel(unsigned cache)
	245	{
	246	bool (fn)(const void , size_t) = buffer_zero_int;
	247	if (cache & CACHE_SSE2) {
	248	fn = buffer_zero_sse2;
b87c99d0	249	length_to_accel = 64;
d9911d14 RH	250	}
	251	#ifdef CONFIG_AVX2_OPT
	252	if (cache & CACHE_SSE4) {
	253	fn = buffer_zero_sse4;
b87c99d0	254	length_to_accel = 64;
d9911d14 RH	255	}
	256	if (cache & CACHE_AVX2) {
	257	fn = buffer_zero_avx2;
8f13a39d	258	length_to_accel = 128;
d9911d14	259	}
27f08ea1 RH	260	#endif
	261	#ifdef CONFIG_AVX512F_OPT
	262	if (cache & CACHE_AVX512F) {
	263	fn = buffer_zero_avx512;
	264	length_to_accel = 256;
	265	}
d9911d14 RH	266	#endif
	267	buffer_accel = fn;
	268	}
88ca8e80	269
27f08ea1	270	#if defined(CONFIG_AVX512F_OPT) \|\| defined(CONFIG_AVX2_OPT)
5dd89908 RH	271	#include "qemu/cpuid.h"
5dd89908 RH	272
5e33a872	273	static void __attribute__((constructor)) init_cpuid_cache(void)
88ca8e80	274	{
5e33a872 RH	275	int max = __get_cpuid_max(0, NULL);
	276	int a, b, c, d;
	277	unsigned cache = 0;
88ca8e80	278
5e33a872 RH	279	if (max >= 1) {
	280	__cpuid(1, a, b, c, d);
	281	if (d & bit_SSE2) {
	282	cache \|= CACHE_SSE2;
	283	}
5e33a872 RH	284	if (c & bit_SSE4_1) {
5e33a872 RH	285	cache \|= CACHE_SSE4;
88ca8e80	286	}
88ca8e80	287
5e33a872	288	/* We must check that AVX is not just available, but usable. */
d9911d14 RH	289	if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
	290	int bv;
	291	__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
	292	__cpuid_count(7, 0, a, b, c, d);
27f08ea1	293	if ((bv & 0x6) == 0x6 && (b & bit_AVX2)) {
d9911d14	294	cache \|= CACHE_AVX2;
5e33a872	295	}
27f08ea1 RH	296	/* 0xe6:
	297	* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
	298	* and ZMM16-ZMM31 state are enabled by OS)
	299	* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
	300	*/
	301	if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512F)) {
	302	cache \|= CACHE_AVX512F;
	303	}
88ca8e80 RH	304	}
88ca8e80 RH	305	}
5e33a872	306	cpuid_cache = cache;
d9911d14	307	init_accel(cache);
88ca8e80	308	}
d9911d14	309	#endif /* CONFIG_AVX2_OPT */
88ca8e80	310
efad6682 RH	311	bool test_buffer_is_zero_next_accel(void)
	312	{
	313	/* If no bits set, we just tested buffer_zero_int, and there
	314	are no more acceleration options to test. */
	315	if (cpuid_cache == 0) {
	316	return false;
	317	}
	318	/* Disable the accelerator we used before and select a new one. */
	319	cpuid_cache &= cpuid_cache - 1;
d9911d14	320	init_accel(cpuid_cache);
efad6682 RH	321	return true;
	322	}
	323
5e33a872	324	static bool select_accel_fn(const void *buf, size_t len)
88ca8e80	325	{
27f08ea1	326	if (likely(len >= length_to_accel)) {
d9911d14	327	return buffer_accel(buf, len);
5e33a872 RH	328	}
5e33a872 RH	329	return buffer_zero_int(buf, len);
88ca8e80 RH	330	}
88ca8e80 RH	331
5e33a872 RH	332	#else
5e33a872 RH	333	#define select_accel_fn buffer_zero_int
efad6682 RH	334	bool test_buffer_is_zero_next_accel(void)
	335	{
	336	return false;
	337	}
	338	#endif
	339
88ca8e80 RH	340	/*
88ca8e80 RH	341	* Checks if a buffer is all zeroes
88ca8e80 RH	342	*/
	343	bool buffer_is_zero(const void *buf, size_t len)
	344	{
5e33a872 RH	345	if (unlikely(len == 0)) {
5e33a872 RH	346	return true;
88ca8e80 RH	347	}
88ca8e80 RH	348
083d012a RH	349	/* Fetch the beginning of the buffer while we select the accelerator. */
	350	__builtin_prefetch(buf);
	351
5e33a872 RH	352	/* Use an optimized zero check if possible. Note that this also
	353	includes a check for an unrolled loop over 64-bit integers. */
	354	return select_accel_fn(buf, len);
88ca8e80	355	}