/*
 * Simple C functions to supplement the C library
4 * Copyright (c) 2006 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24 #include "qemu/osdep.h"
25 #include "qemu-common.h"
26 #include "qemu/cutils.h"
/* vector definitions */
#ifdef __ALTIVEC__
#include <altivec.h>
/* The altivec.h header says we're allowed to undef these for
 * C++ compatibility. Here we don't care about C++, but we
 * undef them anyway to avoid namespace pollution.
 */
#undef vector
#undef pixel
#undef bool
#define VECTYPE        __vector unsigned char
#define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
#define VEC_OR(v1, v2) ((v1) | (v2))
/* altivec.h may redefine the bool macro as vector type.
 * Reset it to POSIX semantics. */
#define bool _Bool
#elif defined __SSE2__
#include <emmintrin.h>
#define VECTYPE        __m128i
#define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
#define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
#elif defined(__aarch64__)
#include "arm_neon.h"
#define VECTYPE        uint64x2_t
#define ALL_EQ(v1, v2) \
        ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
         (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))
#define VEC_OR(v1, v2) ((v1) | (v2))
#else
/* Scalar fallback: one machine word per "vector". */
#define VECTYPE        unsigned long
#define ALL_EQ(v1, v2) ((v1) == (v2))
#define VEC_OR(v1, v2) ((v1) | (v2))
#endif

/* Number of VECTYPE chunks processed per iteration of the main scan loop. */
#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
66 can_use_buffer_find_nonzero_offset_inner(const void *buf
, size_t len
)
68 return (len
% (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
69 * sizeof(VECTYPE
)) == 0
70 && ((uintptr_t) buf
) % sizeof(VECTYPE
) == 0);
74 * Searches for an area with non-zero content in a buffer
76 * Attention! The len must be a multiple of
77 * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
78 * and addr must be a multiple of sizeof(VECTYPE) due to
79 * restriction of optimizations in this function.
81 * can_use_buffer_find_nonzero_offset_inner() can be used to
82 * check these requirements.
84 * The return value is the offset of the non-zero area rounded
85 * down to a multiple of sizeof(VECTYPE) for the first
86 * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
87 * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
90 * If the buffer is all zero the return value is equal to len.
93 static size_t buffer_find_nonzero_offset_inner(const void *buf
, size_t len
)
95 const VECTYPE
*p
= buf
;
96 const VECTYPE zero
= (VECTYPE
){0};
99 assert(can_use_buffer_find_nonzero_offset_inner(buf
, len
));
105 for (i
= 0; i
< BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
; i
++) {
106 if (!ALL_EQ(p
[i
], zero
)) {
107 return i
* sizeof(VECTYPE
);
111 for (i
= BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
;
112 i
< len
/ sizeof(VECTYPE
);
113 i
+= BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
) {
114 VECTYPE tmp0
= VEC_OR(p
[i
+ 0], p
[i
+ 1]);
115 VECTYPE tmp1
= VEC_OR(p
[i
+ 2], p
[i
+ 3]);
116 VECTYPE tmp2
= VEC_OR(p
[i
+ 4], p
[i
+ 5]);
117 VECTYPE tmp3
= VEC_OR(p
[i
+ 6], p
[i
+ 7]);
118 VECTYPE tmp01
= VEC_OR(tmp0
, tmp1
);
119 VECTYPE tmp23
= VEC_OR(tmp2
, tmp3
);
120 if (!ALL_EQ(VEC_OR(tmp01
, tmp23
), zero
)) {
125 return i
* sizeof(VECTYPE
);
128 #if defined CONFIG_AVX2_OPT
129 #pragma GCC push_options
130 #pragma GCC target("avx2")
132 #include <immintrin.h>
134 #define AVX2_VECTYPE __m256i
135 #define AVX2_ALL_EQ(v1, v2) \
136 (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
137 #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
140 can_use_buffer_find_nonzero_offset_avx2(const void *buf
, size_t len
)
142 return (len
% (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
143 * sizeof(AVX2_VECTYPE
)) == 0
144 && ((uintptr_t) buf
) % sizeof(AVX2_VECTYPE
) == 0);
147 static size_t buffer_find_nonzero_offset_avx2(const void *buf
, size_t len
)
149 const AVX2_VECTYPE
*p
= buf
;
150 const AVX2_VECTYPE zero
= (AVX2_VECTYPE
){0};
153 assert(can_use_buffer_find_nonzero_offset_avx2(buf
, len
));
159 for (i
= 0; i
< BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
; i
++) {
160 if (!AVX2_ALL_EQ(p
[i
], zero
)) {
161 return i
* sizeof(AVX2_VECTYPE
);
165 for (i
= BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
;
166 i
< len
/ sizeof(AVX2_VECTYPE
);
167 i
+= BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
) {
168 AVX2_VECTYPE tmp0
= AVX2_VEC_OR(p
[i
+ 0], p
[i
+ 1]);
169 AVX2_VECTYPE tmp1
= AVX2_VEC_OR(p
[i
+ 2], p
[i
+ 3]);
170 AVX2_VECTYPE tmp2
= AVX2_VEC_OR(p
[i
+ 4], p
[i
+ 5]);
171 AVX2_VECTYPE tmp3
= AVX2_VEC_OR(p
[i
+ 6], p
[i
+ 7]);
172 AVX2_VECTYPE tmp01
= AVX2_VEC_OR(tmp0
, tmp1
);
173 AVX2_VECTYPE tmp23
= AVX2_VEC_OR(tmp2
, tmp3
);
174 if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01
, tmp23
), zero
)) {
179 return i
* sizeof(AVX2_VECTYPE
);
182 static bool avx2_support(void)
186 if (__get_cpuid_max(0, NULL
) < 7) {
190 __cpuid_count(7, 0, a
, b
, c
, d
);
/* Public entry points, bound at load time through GNU ifunc: the
 * resolver functions below select the AVX2 implementations when the
 * running CPU supports them, and the generic ones otherwise. */
bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
    __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
size_t buffer_find_nonzero_offset(const void *buf, size_t len)
    __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
200 static void *buffer_find_nonzero_offset_ifunc(void)
202 typeof(buffer_find_nonzero_offset
) *func
= (avx2_support()) ?
203 buffer_find_nonzero_offset_avx2
: buffer_find_nonzero_offset_inner
;
208 static void *can_use_buffer_find_nonzero_offset_ifunc(void)
210 typeof(can_use_buffer_find_nonzero_offset
) *func
= (avx2_support()) ?
211 can_use_buffer_find_nonzero_offset_avx2
:
212 can_use_buffer_find_nonzero_offset_inner
;
216 #pragma GCC pop_options
218 bool can_use_buffer_find_nonzero_offset(const void *buf
, size_t len
)
220 return can_use_buffer_find_nonzero_offset_inner(buf
, len
);
223 size_t buffer_find_nonzero_offset(const void *buf
, size_t len
)
225 return buffer_find_nonzero_offset_inner(buf
, len
);
/*
 * Checks if a buffer is all zeroes
 *
 * Attention! The len must be a multiple of 4 * sizeof(long) due to
 * restriction of optimizations in this function.
 */
235 bool buffer_is_zero(const void *buf
, size_t len
)
238 * Use long as the biggest available internal data type that fits into the
239 * CPU register and unroll the loop to smooth out the effect of memory
245 const long * const data
= buf
;
247 /* use vector optimized zero check if possible */
248 if (can_use_buffer_find_nonzero_offset(buf
, len
)) {
249 return buffer_find_nonzero_offset(buf
, len
) == len
;
252 assert(len
% (4 * sizeof(long)) == 0);
255 for (i
= 0; i
< len
; i
+= 4) {
261 if (d0
|| d1
|| d2
|| d3
) {