1 | /* |
2 | * Simple C functions to supplement the C library | |
3 | * | |
4 | * Copyright (c) 2006 Fabrice Bellard | |
5 | * | |
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy | |
7 | * of this software and associated documentation files (the "Software"), to deal | |
8 | * in the Software without restriction, including without limitation the rights | |
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 | * copies of the Software, and to permit persons to whom the Software is | |
11 | * furnished to do so, subject to the following conditions: | |
12 | * | |
13 | * The above copyright notice and this permission notice shall be included in | |
14 | * all copies or substantial portions of the Software. | |
15 | * | |
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
22 | * THE SOFTWARE. | |
23 | */ | |
24 | #include "qemu/osdep.h" | |
25 | #include "qemu-common.h" | |
26 | #include "qemu/cutils.h" | |
5e33a872 | 27 | #include "qemu/bswap.h" |
88ca8e80 RH |
28 | |
29 | ||
/* vector definitions */

/* Deliberately never defined: referencing it forces a link-time error
   if ACCEL_BUFFER_ZERO is instantiated with an unsupported SIZE.  The
   call is in a branch the compiler can fold away for valid SIZEs.  */
extern void link_error(void);

/*
 * ACCEL_BUFFER_ZERO - generate a vectorized all-zero check.
 *
 * NAME:    name of the generated static function
 * SIZE:    bytes consumed per loop iteration; must equal 4 or 8
 *          vectors of VECTYPE, or the build fails via link_error()
 * VECTYPE: vector type used for the loads (e.g. __m128i, __m256i)
 * NONZERO: macro testing whether a VECTYPE value has any set bit
 *
 * The generated function returns true iff buf[0..len) is all zeroes.
 * Callers must guarantee len is a nonzero multiple of SIZE and that
 * buf is sufficiently aligned for VECTYPE loads (see select_accel_fn).
 */
#define ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO) \
static bool NAME(const void *buf, size_t len)           \
{                                                       \
    const void *end = buf + len;                        \
    do {                                                \
        const VECTYPE *p = buf;                         \
        VECTYPE t;                                      \
        if (SIZE == sizeof(VECTYPE) * 4) {              \
            t = (p[0] | p[1]) | (p[2] | p[3]);          \
        } else if (SIZE == sizeof(VECTYPE) * 8) {       \
            t = p[0] | p[1];                            \
            t |= p[2] | p[3];                           \
            t |= p[4] | p[5];                           \
            t |= p[6] | p[7];                           \
        } else {                                        \
            link_error();                               \
        }                                               \
        if (unlikely(NONZERO(t))) {                     \
            return false;                               \
        }                                               \
        buf += SIZE;                                    \
    } while (buf < end);                                \
    return true;                                        \
}
58 | ||
/*
 * Integer-only check that buf[0..len) is all zeroes; the portable
 * fallback when no SIMD routine applies.  Returns true iff every
 * byte is zero.
 */
static bool
buffer_zero_int(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* For a very small buffer, simply accumulate all the bytes.  */
        const unsigned char *p = buf;
        const unsigned char *e = buf + len;
        unsigned char t = 0;

        /* Note: len == 0 would read one byte; buffer_is_zero filters
           that out before we get here.  */
        do {
            t |= *p++;
        } while (p < e);

        return t == 0;
    } else {
        /* Otherwise, use the unaligned memory access functions to
           handle the beginning and end of the buffer, with a couple
           of loops handling the middle aligned section.  */
        uint64_t t = ldq_he_p(buf);
        /* p: first 8-byte-aligned address >= buf (strictly past buf
           if buf itself is aligned - that word is in t already).
           e: last aligned address <= buf + len.  */
        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);

        /* Unrolled loop over 64-byte chunks.  The accumulator from
           the previous iteration is tested here, so the test of one
           chunk overlaps the loads of the next.  */
        for (; p + 8 <= e; p += 8) {
            __builtin_prefetch(p + 8);
            if (t) {
                return false;
            }
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        }
        /* Fewer than 8 aligned words remain: fold them in one by one.  */
        while (p < e) {
            t |= *p++;
        }
        /* Fold in the final, possibly unaligned, 8 bytes; safe since
           len >= 8 on this path.  Overlap with earlier words is fine
           because we only OR.  */
        t |= ldq_he_p(buf + len - 8);

        return t == 0;
    }
}
96 | ||
/* x86 SIMD implementations, compiled only when <cpuid.h> and at least
   SSE2 support are available at build time.  */
#if defined(CONFIG_AVX2_OPT) || (defined(CONFIG_CPUID_H) && defined(__SSE2__))
#include <cpuid.h>

/* Do not use push_options pragmas unnecessarily, because clang
 * does not support them.
 */
#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#endif
#include <emmintrin.h>
/* Compare X against zero bytewise; the movemask is 0xFFFF only when
   all 16 bytes compared equal, i.e. X was all zeroes.  */
#define SSE2_NONZERO(X) \
    (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF)
ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
#ifndef __SSE2__
#pragma GCC pop_options
#endif

#ifdef CONFIG_AVX2_OPT
#pragma GCC push_options
#pragma GCC target("avx2")
#include <immintrin.h>
/* _mm256_testz_si256(X, X) is nonzero iff X & X == 0, i.e. X is all
   zeroes; negate to get a "has any nonzero bit" test.  */
#define AVX2_NONZERO(X) !_mm256_testz_si256((X), (X))
ACCEL_BUFFER_ZERO(buffer_zero_avx2, 128, __m256i, AVX2_NONZERO)
#pragma GCC pop_options
#endif
88ca8e80 | 123 | |
5e33a872 RH |
/* Bit flags for cpuid_cache, recording which accelerated routines the
   host CPU (and OS) can actually run.  */
#define CACHE_AVX2 2
#define CACHE_AVX1 4
#define CACHE_SSE4 8
#define CACHE_SSE2 16

/* Set once at program load by init_cpuid_cache; read-only afterwards.  */
static unsigned cpuid_cache;

/* Constructor: probe CPUID once at startup so select_accel_fn only has
   to test a cached bitmask on every call.  */
static void __attribute__((constructor)) init_cpuid_cache(void)
{
    int max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    unsigned cache = 0;

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        if (d & bit_SSE2) {
            cache |= CACHE_SSE2;
        }
#ifdef CONFIG_AVX2_OPT
        if (c & bit_SSE4_1) {
            cache |= CACHE_SSE4;
        }

        /* We must check that AVX is not just available, but usable.  */
        if ((c & bit_OSXSAVE) && (c & bit_AVX)) {
            /* xgetbv with ecx=0 reads XCR0; bits 1 and 2 (value 6)
               indicate the OS saves/restores SSE and AVX state.  */
            __asm("xgetbv" : "=a"(a), "=d"(d) : "c"(0));
            if ((a & 6) == 6) {
                cache |= CACHE_AVX1;
                if (max >= 7) {
                    __cpuid_count(7, 0, a, b, c, d);
                    if (b & bit_AVX2) {
                        cache |= CACHE_AVX2;
                    }
                }
            }
        }
#endif
    }
    cpuid_cache = cache;
}
164 | ||
5e33a872 | 165 | static bool select_accel_fn(const void *buf, size_t len) |
88ca8e80 | 166 | { |
5e33a872 RH |
167 | uintptr_t ibuf = (uintptr_t)buf; |
168 | #ifdef CONFIG_AVX2_OPT | |
169 | if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) { | |
170 | return buffer_zero_avx2(buf, len); | |
88ca8e80 | 171 | } |
5e33a872 RH |
172 | #endif |
173 | if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) { | |
174 | return buffer_zero_sse2(buf, len); | |
175 | } | |
176 | return buffer_zero_int(buf, len); | |
88ca8e80 RH |
177 | } |
178 | ||
5e33a872 RH |
179 | #else |
180 | #define select_accel_fn buffer_zero_int | |
88ca8e80 RH |
181 | #endif |
182 | ||
183 | /* | |
184 | * Checks if a buffer is all zeroes | |
88ca8e80 RH |
185 | */ |
186 | bool buffer_is_zero(const void *buf, size_t len) | |
187 | { | |
5e33a872 RH |
188 | if (unlikely(len == 0)) { |
189 | return true; | |
88ca8e80 RH |
190 | } |
191 | ||
5e33a872 RH |
192 | /* Use an optimized zero check if possible. Note that this also |
193 | includes a check for an unrolled loop over 64-bit integers. */ | |
194 | return select_accel_fn(buf, len); | |
88ca8e80 | 195 | } |