]> git.proxmox.com Git - mirror_qemu.git/blob - util/bufferiszero.c
cutils: Remove SPLAT macro
[mirror_qemu.git] / util / bufferiszero.c
1 /*
2 * Simple C functions to supplement the C library
3 *
4 * Copyright (c) 2006 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "qemu/osdep.h"
25 #include "qemu-common.h"
26 #include "qemu/cutils.h"
27
28
29 /* vector definitions */
30 #ifdef __ALTIVEC__
31 #include <altivec.h>
32 /* The altivec.h header says we're allowed to undef these for
33 * C++ compatibility. Here we don't care about C++, but we
34 * undef them anyway to avoid namespace pollution.
35 */
36 #undef vector
37 #undef pixel
38 #undef bool
39 #define VECTYPE __vector unsigned char
40 #define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
41 #define VEC_OR(v1, v2) ((v1) | (v2))
42 /* altivec.h may redefine the bool macro as vector type.
43 * Reset it to POSIX semantics. */
44 #define bool _Bool
45 #elif defined __SSE2__
46 #include <emmintrin.h>
47 #define VECTYPE __m128i
48 #define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
49 #define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
50 #elif defined(__aarch64__)
51 #include "arm_neon.h"
52 #define VECTYPE uint64x2_t
53 #define ALL_EQ(v1, v2) \
54 ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
55 (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))
56 #define VEC_OR(v1, v2) ((v1) | (v2))
57 #else
58 #define VECTYPE unsigned long
59 #define ALL_EQ(v1, v2) ((v1) == (v2))
60 #define VEC_OR(v1, v2) ((v1) | (v2))
61 #endif
62
63 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
64
65 static bool
66 can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
67 {
68 return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
69 * sizeof(VECTYPE)) == 0
70 && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
71 }
72
73 /*
74 * Searches for an area with non-zero content in a buffer
75 *
76 * Attention! The len must be a multiple of
77 * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
78 * and addr must be a multiple of sizeof(VECTYPE) due to
79 * restriction of optimizations in this function.
80 *
81 * can_use_buffer_find_nonzero_offset_inner() can be used to
82 * check these requirements.
83 *
84 * The return value is the offset of the non-zero area rounded
85 * down to a multiple of sizeof(VECTYPE) for the first
86 * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
87 * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
88 * afterwards.
89 *
90 * If the buffer is all zero the return value is equal to len.
91 */
92
93 static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
94 {
95 const VECTYPE *p = buf;
96 const VECTYPE zero = (VECTYPE){0};
97 size_t i;
98
99 assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
100
101 if (!len) {
102 return 0;
103 }
104
105 for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
106 if (!ALL_EQ(p[i], zero)) {
107 return i * sizeof(VECTYPE);
108 }
109 }
110
111 for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
112 i < len / sizeof(VECTYPE);
113 i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
114 VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
115 VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
116 VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
117 VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
118 VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
119 VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
120 if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
121 break;
122 }
123 }
124
125 return i * sizeof(VECTYPE);
126 }
127
128 #if defined CONFIG_AVX2_OPT
129 #pragma GCC push_options
130 #pragma GCC target("avx2")
131 #include <cpuid.h>
132 #include <immintrin.h>
133
134 #define AVX2_VECTYPE __m256i
135 #define AVX2_ALL_EQ(v1, v2) \
136 (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
137 #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
138
139 static bool
140 can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
141 {
142 return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
143 * sizeof(AVX2_VECTYPE)) == 0
144 && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
145 }
146
147 static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
148 {
149 const AVX2_VECTYPE *p = buf;
150 const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
151 size_t i;
152
153 assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
154
155 if (!len) {
156 return 0;
157 }
158
159 for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
160 if (!AVX2_ALL_EQ(p[i], zero)) {
161 return i * sizeof(AVX2_VECTYPE);
162 }
163 }
164
165 for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
166 i < len / sizeof(AVX2_VECTYPE);
167 i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
168 AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
169 AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
170 AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
171 AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
172 AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
173 AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
174 if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
175 break;
176 }
177 }
178
179 return i * sizeof(AVX2_VECTYPE);
180 }
181
/*
 * Report whether AVX2 may safely be used on this host.
 *
 * The previous hand-rolled check tested only the AVX2 CPUID feature bit
 * (leaf 7, EBX).  That is insufficient: the OS must also have enabled
 * YMM state saving (CPUID.1:ECX.OSXSAVE plus XCR0 bits 2:1 via XGETBV),
 * otherwise executing AVX2 instructions faults even though the CPU
 * advertises them.  __builtin_cpu_supports performs the complete check,
 * including the XGETBV test, and this file is already GCC-specific
 * (ifunc attribute, GCC pragmas), so the builtin is available.
 */
static bool avx2_support(void)
{
    return __builtin_cpu_supports("avx2");
}
194
/*
 * Bind the public entry points as GNU indirect functions (ifunc): the
 * dynamic linker calls the named resolver once, at symbol-resolution
 * time, and from then on every call goes straight to the returned
 * implementation -- the AVX2 variants when avx2_support() says so, the
 * generic ones otherwise.  The trailing backslash splices declaration
 * and attribute into a single logical line.
 *
 * NOTE(review): ifunc resolvers run very early, possibly before all
 * relocations are processed, so they must only touch code/data that is
 * already usable at that point -- avx2_support() and the function
 * addresses qualify; do not add calls to other library code here.
 */
bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
    __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
    __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));

/* Resolver: picks the buffer_find_nonzero_offset implementation. */
static void *buffer_find_nonzero_offset_ifunc(void)
{
    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;

    return func;
}

/* Resolver: picks the can_use_buffer_find_nonzero_offset implementation. */
static void *can_use_buffer_find_nonzero_offset_ifunc(void)
{
    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
        can_use_buffer_find_nonzero_offset_avx2 :
        can_use_buffer_find_nonzero_offset_inner;

    return func;
}
216 #pragma GCC pop_options
217 #else
/* Non-AVX2 build: the public symbol forwards to the generic check. */
bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
{
    bool usable = can_use_buffer_find_nonzero_offset_inner(buf, len);

    return usable;
}
222
/* Non-AVX2 build: the public symbol forwards to the generic scan. */
size_t buffer_find_nonzero_offset(const void *buf, size_t len)
{
    size_t offset = buffer_find_nonzero_offset_inner(buf, len);

    return offset;
}
227 #endif
228
/*
 * Checks if a buffer is all zeroes.
 *
 * Uses the vectorized scan when buf/len meet its alignment and length
 * requirements; otherwise falls back to a scalar loop over 'long'-sized
 * words, unrolled four at a time, finishing with a byte loop for any
 * remainder.  Unlike the historical version, len is no longer required
 * to be a multiple of 4 * sizeof(long) -- arbitrary lengths are handled
 * (previously such lengths tripped an assert).
 *
 * NOTE(review): as before, the scalar path reads 'long'-sized words
 * directly from buf; callers are presumed to pass buffers suitably
 * aligned for that -- confirm if new callers appear.
 */
bool buffer_is_zero(const void *buf, size_t len)
{
    /*
     * Use long as the biggest available internal data type that fits into
     * the CPU register and unroll the loop to smooth out the effect of
     * memory latency.
     */
    const long *data = buf;
    const unsigned char *bytes = buf;
    size_t nwords = len / sizeof(long);
    size_t nunrolled = nwords / 4 * 4;   /* words covered by the 4-way loop */
    size_t i;

    /* use vector optimized zero check if possible */
    if (can_use_buffer_find_nonzero_offset(buf, len)) {
        return buffer_find_nonzero_offset(buf, len) == len;
    }

    for (i = 0; i < nunrolled; i += 4) {
        long d0 = data[i + 0];
        long d1 = data[i + 1];
        long d2 = data[i + 2];
        long d3 = data[i + 3];

        /* OR the lanes so a single branch tests all four words */
        if (d0 | d1 | d2 | d3) {
            return false;
        }
    }

    /* tail: any words/bytes beyond the last full group of four words */
    for (i = nunrolled * sizeof(long); i < len; i++) {
        if (bytes[i]) {
            return false;
        }
    }

    return true;
}
268