]>
Commit | Line | Data |
---|---|---|
88ca8e80 RH |
1 | /* |
2 | * Simple C functions to supplement the C library | |
3 | * | |
4 | * Copyright (c) 2006 Fabrice Bellard | |
5 | * | |
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy | |
7 | * of this software and associated documentation files (the "Software"), to deal | |
8 | * in the Software without restriction, including without limitation the rights | |
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 | * copies of the Software, and to permit persons to whom the Software is | |
11 | * furnished to do so, subject to the following conditions: | |
12 | * | |
13 | * The above copyright notice and this permission notice shall be included in | |
14 | * all copies or substantial portions of the Software. | |
15 | * | |
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
22 | * THE SOFTWARE. | |
23 | */ | |
24 | #include "qemu/osdep.h" | |
88ca8e80 | 25 | #include "qemu/cutils.h" |
5e33a872 | 26 | #include "qemu/bswap.h" |
51f4d916 | 27 | #include "host/cpuinfo.h" |
88ca8e80 | 28 | |
/*
 * Portable zero test that uses only integer loads.
 *
 * @buf: start of the buffer to examine
 * @len: number of bytes to examine; must be non-zero, because the
 *       short-buffer path below always reads at least one byte
 *       (buffer_is_zero() filters out len == 0 before getting here)
 *
 * Returns true if every byte in [buf, buf + len) is zero.
 */
static bool
buffer_zero_int(const void *buf, size_t len)
{
    const uint64_t *word, *word_end;
    uint64_t acc;

    if (unlikely(len < 8)) {
        /* Too short for a 64-bit load: OR the bytes together one by one. */
        const unsigned char *byte = buf;
        const unsigned char *byte_end = byte + len;
        unsigned char bits = 0;

        do {
            bits |= *byte;
            byte++;
        } while (byte < byte_end);

        return bits == 0;
    }

    /*
     * The first and last 8 bytes are fetched with the unaligned access
     * helper; everything in between is read through 8-byte aligned
     * pointers.  The head and tail loads may overlap the aligned part,
     * which is harmless since we only OR the values together.
     */
    acc = ldq_he_p(buf);
    word = (const uint64_t *)(((uintptr_t)buf + 8) & -8);
    word_end = (const uint64_t *)(((uintptr_t)buf + len) & -8);

    /*
     * Unrolled loop over groups of eight words.  The accumulator tested
     * at the top of each pass holds the data loaded by the previous pass
     * (or the unaligned head on the first pass).
     */
    while (word + 8 <= word_end) {
        __builtin_prefetch(word + 8);
        if (acc != 0) {
            return false;
        }
        acc = word[0] | word[1] | word[2] | word[3]
            | word[4] | word[5] | word[6] | word[7];
        word += 8;
    }

    /* Remaining aligned words, fewer than eight of them. */
    for (; word < word_end; word++) {
        acc |= *word;
    }

    /* Unaligned tail: the final 8 bytes of the buffer. */
    acc |= ldq_he_p(buf + len - 8);

    return acc == 0;
}
66 | ||
27f08ea1 | 67 | #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__) |
701ea587 | 68 | #include <immintrin.h> |
d9911d14 RH |
69 | |
70 | /* Note that each of these vectorized functions requires a minimum len: 64 for SSE2/SSE4, 128 for AVX2 and 256 for AVX512. */ | |
71 | ||
/*
 * SSE2 zero test.
 *
 * @buf: start of the buffer to examine
 * @len: number of bytes to examine; the caller must guarantee len >= 64
 *
 * Returns true if every byte in [buf, buf + len) is zero.
 */
static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    const __m128i zero = _mm_setzero_si128();
    /* Unaligned head: the first 16 bytes. */
    __m128i acc = _mm_loadu_si128(buf);
    /*
     * 'blk' is biased by four vectors so that blk[-4] .. blk[-1] address
     * the current 16-byte aligned group of 64 bytes.
     */
    __m128i *blk = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    __m128i *end = (__m128i *)(((uintptr_t)buf + len) & -16);
    __m128i eq;

    /*
     * Walk the aligned 64-byte groups.  Each pass tests the data gathered
     * by the previous pass (initially the head) before loading more.
     */
    for (; likely(blk <= end); blk += 4) {
        __builtin_prefetch(blk);
        eq = _mm_cmpeq_epi8(acc, zero);
        if (unlikely(_mm_movemask_epi8(eq) != 0xFFFF)) {
            return false;
        }
        acc = blk[-4] | blk[-3] | blk[-2] | blk[-1];
    }

    /* Aligned tail: the last three whole vectors below 'end'. */
    acc |= end[-3];
    acc |= end[-2];
    acc |= end[-1];

    /* Unaligned tail: the final 16 bytes of the buffer. */
    acc |= _mm_loadu_si128(buf + len - 16);

    /* All 16 byte lanes must compare equal to zero. */
    eq = _mm_cmpeq_epi8(acc, zero);
    return _mm_movemask_epi8(eq) == 0xFFFF;
}
88ca8e80 | 101 | |
5e33a872 | 102 | #ifdef CONFIG_AVX2_OPT |
/*
 * SSE4 zero test; same traversal as the SSE2 routine, but the "is it
 * all zero" check is a single _mm_testz_si128().
 *
 * @buf: start of the buffer to examine
 * @len: number of bytes to examine; the caller must guarantee len >= 64
 *
 * Returns true if every byte in [buf, buf + len) is zero.
 */
static bool __attribute__((target("sse4")))
buffer_zero_sse4(const void *buf, size_t len)
{
    /* Unaligned head: the first 16 bytes. */
    __m128i acc = _mm_loadu_si128(buf);
    /*
     * 'blk' is biased by four vectors so that blk[-4] .. blk[-1] address
     * the current 16-byte aligned group of 64 bytes.
     */
    __m128i *blk = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    __m128i *end = (__m128i *)(((uintptr_t)buf + len) & -16);

    /*
     * Walk the aligned 64-byte groups.  Each pass tests the data gathered
     * by the previous pass (initially the head) before loading more.
     */
    for (; likely(blk <= end); blk += 4) {
        __builtin_prefetch(blk);
        if (unlikely(!_mm_testz_si128(acc, acc))) {
            return false;
        }
        acc = blk[-4] | blk[-3] | blk[-2] | blk[-1];
    }

    /* Aligned tail: the last three whole vectors below 'end'. */
    acc |= end[-3];
    acc |= end[-2];
    acc |= end[-1];

    /* Unaligned tail: the final 16 bytes of the buffer. */
    acc |= _mm_loadu_si128(buf + len - 16);

    return _mm_testz_si128(acc, acc);
}
130 | ||
/*
 * AVX2 zero test.
 *
 * @buf: start of the buffer to examine
 * @len: number of bytes to examine; the caller must guarantee len >= 128,
 *       since the tail unconditionally loads the last 4 * 32 bytes
 *
 * Returns true if every byte in [buf, buf + len) is zero.
 */
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned head: the first 32 bytes. */
    __m256i acc = _mm256_loadu_si256(buf);
    /*
     * 'blk' is biased by four vectors so that blk[-4] .. blk[-1] address
     * the current 32-byte aligned group of 128 bytes.
     */
    __m256i *blk = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    __m256i *end = (__m256i *)(((uintptr_t)buf + len) & -32);

    /*
     * Walk the aligned 128-byte groups.  Each pass tests the data gathered
     * by the previous pass (initially the head) before loading more.
     */
    for (; blk <= end; blk += 4) {
        __builtin_prefetch(blk);
        if (unlikely(!_mm256_testz_si256(acc, acc))) {
            return false;
        }
        acc = blk[-4] | blk[-3] | blk[-2] | blk[-1];
    }

    /* Finish with the last 128 bytes of the buffer, loaded unaligned. */
    acc |= _mm256_loadu_si256(buf + len - 4 * 32);
    acc |= _mm256_loadu_si256(buf + len - 3 * 32);
    acc |= _mm256_loadu_si256(buf + len - 2 * 32);
    acc |= _mm256_loadu_si256(buf + len - 1 * 32);

    return _mm256_testz_si256(acc, acc);
}
d9911d14 RH |
157 | #endif /* CONFIG_AVX2_OPT */ |
158 | ||
27f08ea1 | 159 | #ifdef CONFIG_AVX512F_OPT |
/*
 * AVX512F zero test.
 *
 * @buf: start of the buffer to examine
 * @len: number of bytes to examine; the caller must guarantee len >= 256,
 *       since the tail unconditionally loads the last 4 * 64 bytes
 *
 * Returns true if every byte in [buf, buf + len) is zero.
 */
static bool __attribute__((target("avx512f")))
buffer_zero_avx512(const void *buf, size_t len)
{
    /* Unaligned head: the first 64 bytes. */
    __m512i acc = _mm512_loadu_si512(buf);
    /*
     * 'blk' is biased by four vectors so that blk[-4] .. blk[-1] address
     * the current 64-byte aligned group of 256 bytes.
     */
    __m512i *blk = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
    __m512i *end = (__m512i *)(((uintptr_t)buf + len) & -64);

    /*
     * Walk the aligned 256-byte groups.  Each pass tests the data gathered
     * by the previous pass (initially the head) before loading more.
     */
    for (; blk <= end; blk += 4) {
        __builtin_prefetch(blk);
        if (unlikely(_mm512_test_epi64_mask(acc, acc))) {
            return false;
        }
        acc = blk[-4] | blk[-3] | blk[-2] | blk[-1];
    }

    /* Finish with the last 256 bytes of the buffer, loaded unaligned. */
    acc |= _mm512_loadu_si512(buf + len - 4 * 64);
    acc |= _mm512_loadu_si512(buf + len - 3 * 64);
    acc |= _mm512_loadu_si512(buf + len - 2 * 64);
    acc |= _mm512_loadu_si512(buf + len - 1 * 64);

    /* The mask has a bit set for every non-zero 64-bit lane. */
    return !_mm512_test_epi64_mask(acc, acc);
}
701ea587 | 186 | #endif /* CONFIG_AVX512F_OPT */ |
27f08ea1 | 187 | |
51f4d916 RH |
188 | /* |
189 | * Make sure that these variables are appropriately initialized when | |
d9911d14 | 190 | * SSE2 is enabled on the compiler command-line, but the compiler is |
5dd89908 | 191 | * too old to support CONFIG_AVX2_OPT. |
d9911d14 | 192 | */ |
27f08ea1 | 193 | #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) |
51f4d916 RH |
194 | # define INIT_USED 0 |
195 | # define INIT_LENGTH 0 | |
196 | # define INIT_ACCEL buffer_zero_int | |
d9911d14 RH |
197 | #else |
198 | # ifndef __SSE2__ | |
199 | # error "ISA selection confusion" | |
200 | # endif | |
51f4d916 RH |
201 | # define INIT_USED CPUINFO_SSE2 |
202 | # define INIT_LENGTH 64 | |
203 | # define INIT_ACCEL buffer_zero_sse2 | |
5e33a872 | 204 | #endif |
88ca8e80 | 205 | |
51f4d916 RH |
206 | static unsigned used_accel = INIT_USED; |
207 | static unsigned length_to_accel = INIT_LENGTH; | |
d9911d14 | 208 | static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL; |
88ca8e80 | 209 | |
51f4d916 RH |
210 | static unsigned __attribute__((noinline)) |
211 | select_accel_cpuinfo(unsigned info) | |
d9911d14 | 212 | { |
51f4d916 RH |
213 | /* Array is sorted in order of algorithm preference. */ |
214 | static const struct { | |
215 | unsigned bit; | |
216 | unsigned len; | |
217 | bool (*fn)(const void *, size_t); | |
218 | } all[] = { | |
219 | #ifdef CONFIG_AVX512F_OPT | |
220 | { CPUINFO_AVX512F, 256, buffer_zero_avx512 }, | |
221 | #endif | |
d9911d14 | 222 | #ifdef CONFIG_AVX2_OPT |
51f4d916 RH |
223 | { CPUINFO_AVX2, 128, buffer_zero_avx2 }, |
224 | { CPUINFO_SSE4, 64, buffer_zero_sse4 }, | |
27f08ea1 | 225 | #endif |
51f4d916 RH |
226 | { CPUINFO_SSE2, 64, buffer_zero_sse2 }, |
227 | { CPUINFO_ALWAYS, 0, buffer_zero_int }, | |
228 | }; | |
229 | ||
230 | for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) { | |
231 | if (info & all[i].bit) { | |
232 | length_to_accel = all[i].len; | |
233 | buffer_accel = all[i].fn; | |
234 | return all[i].bit; | |
235 | } | |
27f08ea1 | 236 | } |
51f4d916 | 237 | return 0; |
d9911d14 | 238 | } |
88ca8e80 | 239 | |
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
/*
 * Constructor: choose the initial accelerator from the value returned by
 * cpuinfo_init() and remember which one was picked in used_accel.
 */
static void __attribute__((constructor)) init_accel(void)
{
    unsigned host_info = cpuinfo_init();

    used_accel = select_accel_cpuinfo(host_info);
}
#endif /* CONFIG_AVX512F_OPT || CONFIG_AVX2_OPT */
88ca8e80 | 246 | |
efad6682 RH |
247 | bool test_buffer_is_zero_next_accel(void) |
248 | { | |
51f4d916 RH |
249 | /* |
250 | * Accumulate the accelerators that we've already tested, and | |
251 | * remove them from the set to test this round. We'll get back | |
252 | * a zero from select_accel_cpuinfo when there are no more. | |
253 | */ | |
254 | unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel); | |
255 | used_accel |= used; | |
256 | return used; | |
efad6682 RH |
257 | } |
258 | ||
5e33a872 | 259 | static bool select_accel_fn(const void *buf, size_t len) |
88ca8e80 | 260 | { |
27f08ea1 | 261 | if (likely(len >= length_to_accel)) { |
d9911d14 | 262 | return buffer_accel(buf, len); |
5e33a872 RH |
263 | } |
264 | return buffer_zero_int(buf, len); | |
88ca8e80 RH |
265 | } |
266 | ||
5e33a872 RH |
267 | #else |
268 | #define select_accel_fn buffer_zero_int | |
/*
 * Test hook for builds where select_accel_fn is just buffer_zero_int:
 * there is never another accelerator to switch to.
 */
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}
273 | #endif | |
274 | ||
/*
 * Checks if a buffer is all zeroes.
 *
 * @buf: start of the buffer to examine
 * @len: number of bytes to examine
 *
 * Returns true if all @len bytes are zero.  An empty buffer (len == 0)
 * is reported as zero without touching @buf.
 */
bool buffer_is_zero(const void *buf, size_t len)
{
    if (likely(len != 0)) {
        /* Start fetching the head of the buffer while we dispatch. */
        __builtin_prefetch(buf);

        /*
         * Use an optimized zero check if possible.  Note that this also
         * includes a check for an unrolled loop over 64-bit integers.
         */
        return select_accel_fn(buf, len);
    }

    return true;
}