]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/util/bpacking_simd256_generated.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / util / bpacking_simd256_generated.h
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 // Automatically generated file; DO NOT EDIT.
19
20 #pragma once
21
22 #include <cstdint>
23 #include <cstring>
24
25 #include <xsimd/xsimd.hpp>
26
27 #include "arrow/util/dispatch.h"
28 #include "arrow/util/ubsan.h"
29
30 namespace arrow {
31 namespace internal {
32 namespace {
33
34 using ::arrow::util::SafeLoad;
35
36 template <DispatchLevel level>
37 struct UnpackBits256 {
38
39 using simd_arch = xsimd::avx2;
40 using simd_batch = xsimd::batch<uint32_t, simd_arch>;
41
// Unpacks 32 values of bit width 0.
//
// A zero-width field carries no payload, so nothing is read from `in`; the 32
// output slots out[0..31] are simply cleared to zero.
//
// Returns `in` unchanged, because no input words are consumed.
inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) {
  constexpr int kNumValues = 32;
  // `out` is a by-value parameter, so there is no point advancing it here.
  std::memset(out, 0, kNumValues * sizeof(*out));
  return in;
}
48
49 inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
50 uint32_t mask = 0x1;
51
52 simd_batch masks(mask);
53 simd_batch words, shifts;
54 simd_batch results;
55
56 // extract 1-bit bundles 0 to 7
57 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
58 shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7 };
59 results = (words >> shifts) & masks;
60 results.store_unaligned(out);
61 out += 8;
62
63 // extract 1-bit bundles 8 to 15
64 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
65 shifts = simd_batch{ 8, 9, 10, 11, 12, 13, 14, 15 };
66 results = (words >> shifts) & masks;
67 results.store_unaligned(out);
68 out += 8;
69
70 // extract 1-bit bundles 16 to 23
71 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
72 shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23 };
73 results = (words >> shifts) & masks;
74 results.store_unaligned(out);
75 out += 8;
76
77 // extract 1-bit bundles 24 to 31
78 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
79 shifts = simd_batch{ 24, 25, 26, 27, 28, 29, 30, 31 };
80 results = (words >> shifts) & masks;
81 results.store_unaligned(out);
82 out += 8;
83
84 in += 1;
85 return in;
86 }
87
88 inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) {
89 uint32_t mask = 0x3;
90
91 simd_batch masks(mask);
92 simd_batch words, shifts;
93 simd_batch results;
94
95 // extract 2-bit bundles 0 to 7
96 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
97 shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 };
98 results = (words >> shifts) & masks;
99 results.store_unaligned(out);
100 out += 8;
101
102 // extract 2-bit bundles 8 to 15
103 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
104 shifts = simd_batch{ 16, 18, 20, 22, 24, 26, 28, 30 };
105 results = (words >> shifts) & masks;
106 results.store_unaligned(out);
107 out += 8;
108
109 // extract 2-bit bundles 16 to 23
110 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
111 shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 };
112 results = (words >> shifts) & masks;
113 results.store_unaligned(out);
114 out += 8;
115
116 // extract 2-bit bundles 24 to 31
117 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
118 shifts = simd_batch{ 16, 18, 20, 22, 24, 26, 28, 30 };
119 results = (words >> shifts) & masks;
120 results.store_unaligned(out);
121 out += 8;
122
123 in += 2;
124 return in;
125 }
126
127 inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) {
128 uint32_t mask = 0x7;
129
130 simd_batch masks(mask);
131 simd_batch words, shifts;
132 simd_batch results;
133
134 // extract 3-bit bundles 0 to 7
135 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
136 shifts = simd_batch{ 0, 3, 6, 9, 12, 15, 18, 21 };
137 results = (words >> shifts) & masks;
138 results.store_unaligned(out);
139 out += 8;
140
141 // extract 3-bit bundles 8 to 15
142 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
143 shifts = simd_batch{ 24, 27, 0, 1, 4, 7, 10, 13 };
144 results = (words >> shifts) & masks;
145 results.store_unaligned(out);
146 out += 8;
147
148 // extract 3-bit bundles 16 to 23
149 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
150 shifts = simd_batch{ 16, 19, 22, 25, 28, 0, 2, 5 };
151 results = (words >> shifts) & masks;
152 results.store_unaligned(out);
153 out += 8;
154
155 // extract 3-bit bundles 24 to 31
156 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
157 shifts = simd_batch{ 8, 11, 14, 17, 20, 23, 26, 29 };
158 results = (words >> shifts) & masks;
159 results.store_unaligned(out);
160 out += 8;
161
162 in += 3;
163 return in;
164 }
165
166 inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) {
167 uint32_t mask = 0xf;
168
169 simd_batch masks(mask);
170 simd_batch words, shifts;
171 simd_batch results;
172
173 // extract 4-bit bundles 0 to 7
174 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
175 shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 };
176 results = (words >> shifts) & masks;
177 results.store_unaligned(out);
178 out += 8;
179
180 // extract 4-bit bundles 8 to 15
181 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
182 shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 };
183 results = (words >> shifts) & masks;
184 results.store_unaligned(out);
185 out += 8;
186
187 // extract 4-bit bundles 16 to 23
188 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
189 shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 };
190 results = (words >> shifts) & masks;
191 results.store_unaligned(out);
192 out += 8;
193
194 // extract 4-bit bundles 24 to 31
195 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
196 shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 };
197 results = (words >> shifts) & masks;
198 results.store_unaligned(out);
199 out += 8;
200
201 in += 4;
202 return in;
203 }
204
205 inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) {
206 uint32_t mask = 0x1f;
207
208 simd_batch masks(mask);
209 simd_batch words, shifts;
210 simd_batch results;
211
212 // extract 5-bit bundles 0 to 7
213 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1) };
214 shifts = simd_batch{ 0, 5, 10, 15, 20, 25, 0, 3 };
215 results = (words >> shifts) & masks;
216 results.store_unaligned(out);
217 out += 8;
218
219 // extract 5-bit bundles 8 to 15
220 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
221 shifts = simd_batch{ 8, 13, 18, 23, 0, 1, 6, 11 };
222 results = (words >> shifts) & masks;
223 results.store_unaligned(out);
224 out += 8;
225
226 // extract 5-bit bundles 16 to 23
227 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 31 | SafeLoad<uint32_t>(in + 3) << 1, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
228 shifts = simd_batch{ 16, 21, 26, 0, 4, 9, 14, 19 };
229 results = (words >> shifts) & masks;
230 results.store_unaligned(out);
231 out += 8;
232
233 // extract 5-bit bundles 24 to 31
234 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 29 | SafeLoad<uint32_t>(in + 4) << 3, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
235 shifts = simd_batch{ 24, 0, 2, 7, 12, 17, 22, 27 };
236 results = (words >> shifts) & masks;
237 results.store_unaligned(out);
238 out += 8;
239
240 in += 5;
241 return in;
242 }
243
244 inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) {
245 uint32_t mask = 0x3f;
246
247 simd_batch masks(mask);
248 simd_batch words, shifts;
249 simd_batch results;
250
251 // extract 6-bit bundles 0 to 7
252 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
253 shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10 };
254 results = (words >> shifts) & masks;
255 results.store_unaligned(out);
256 out += 8;
257
258 // extract 6-bit bundles 8 to 15
259 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
260 shifts = simd_batch{ 16, 22, 0, 2, 8, 14, 20, 26 };
261 results = (words >> shifts) & masks;
262 results.store_unaligned(out);
263 out += 8;
264
265 // extract 6-bit bundles 16 to 23
266 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
267 shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10 };
268 results = (words >> shifts) & masks;
269 results.store_unaligned(out);
270 out += 8;
271
272 // extract 6-bit bundles 24 to 31
273 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
274 shifts = simd_batch{ 16, 22, 0, 2, 8, 14, 20, 26 };
275 results = (words >> shifts) & masks;
276 results.store_unaligned(out);
277 out += 8;
278
279 in += 6;
280 return in;
281 }
282
283 inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) {
284 uint32_t mask = 0x7f;
285
286 simd_batch masks(mask);
287 simd_batch words, shifts;
288 simd_batch results;
289
290 // extract 7-bit bundles 0 to 7
291 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
292 shifts = simd_batch{ 0, 7, 14, 21, 0, 3, 10, 17 };
293 results = (words >> shifts) & masks;
294 results.store_unaligned(out);
295 out += 8;
296
297 // extract 7-bit bundles 8 to 15
298 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 27 | SafeLoad<uint32_t>(in + 3) << 5, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
299 shifts = simd_batch{ 24, 0, 6, 13, 20, 0, 2, 9 };
300 results = (words >> shifts) & masks;
301 results.store_unaligned(out);
302 out += 8;
303
304 // extract 7-bit bundles 16 to 23
305 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5) };
306 shifts = simd_batch{ 16, 23, 0, 5, 12, 19, 0, 1 };
307 results = (words >> shifts) & masks;
308 results.store_unaligned(out);
309 out += 8;
310
311 // extract 7-bit bundles 24 to 31
312 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
313 shifts = simd_batch{ 8, 15, 22, 0, 4, 11, 18, 25 };
314 results = (words >> shifts) & masks;
315 results.store_unaligned(out);
316 out += 8;
317
318 in += 7;
319 return in;
320 }
321
322 inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) {
323 uint32_t mask = 0xff;
324
325 simd_batch masks(mask);
326 simd_batch words, shifts;
327 simd_batch results;
328
329 // extract 8-bit bundles 0 to 7
330 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
331 shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 };
332 results = (words >> shifts) & masks;
333 results.store_unaligned(out);
334 out += 8;
335
336 // extract 8-bit bundles 8 to 15
337 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
338 shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 };
339 results = (words >> shifts) & masks;
340 results.store_unaligned(out);
341 out += 8;
342
343 // extract 8-bit bundles 16 to 23
344 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
345 shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 };
346 results = (words >> shifts) & masks;
347 results.store_unaligned(out);
348 out += 8;
349
350 // extract 8-bit bundles 24 to 31
351 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) };
352 shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 };
353 results = (words >> shifts) & masks;
354 results.store_unaligned(out);
355 out += 8;
356
357 in += 8;
358 return in;
359 }
360
361 inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) {
362 uint32_t mask = 0x1ff;
363
364 simd_batch masks(mask);
365 simd_batch words, shifts;
366 simd_batch results;
367
368 // extract 9-bit bundles 0 to 7
369 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 27 | SafeLoad<uint32_t>(in + 1) << 5, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1 };
370 shifts = simd_batch{ 0, 9, 18, 0, 4, 13, 22, 0 };
371 results = (words >> shifts) & masks;
372 results.store_unaligned(out);
373 out += 8;
374
375 // extract 9-bit bundles 8 to 15
376 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4) };
377 shifts = simd_batch{ 8, 17, 0, 3, 12, 21, 0, 7 };
378 results = (words >> shifts) & masks;
379 results.store_unaligned(out);
380 out += 8;
381
382 // extract 9-bit bundles 16 to 23
383 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 25 | SafeLoad<uint32_t>(in + 5) << 7, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
384 shifts = simd_batch{ 16, 0, 2, 11, 20, 0, 6, 15 };
385 results = (words >> shifts) & masks;
386 results.store_unaligned(out);
387 out += 8;
388
389 // extract 9-bit bundles 24 to 31
390 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) };
391 shifts = simd_batch{ 0, 1, 10, 19, 0, 5, 14, 23 };
392 results = (words >> shifts) & masks;
393 results.store_unaligned(out);
394 out += 8;
395
396 in += 9;
397 return in;
398 }
399
400 inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) {
401 uint32_t mask = 0x3ff;
402
403 simd_batch masks(mask);
404 simd_batch words, shifts;
405 simd_batch results;
406
407 // extract 10-bit bundles 0 to 7
408 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2) };
409 shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6 };
410 results = (words >> shifts) & masks;
411 results.store_unaligned(out);
412 out += 8;
413
414 // extract 10-bit bundles 8 to 15
415 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
416 shifts = simd_batch{ 16, 0, 4, 14, 0, 2, 12, 22 };
417 results = (words >> shifts) & masks;
418 results.store_unaligned(out);
419 out += 8;
420
421 // extract 10-bit bundles 16 to 23
422 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 30 | SafeLoad<uint32_t>(in + 6) << 2, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7) };
423 shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6 };
424 results = (words >> shifts) & masks;
425 results.store_unaligned(out);
426 out += 8;
427
428 // extract 10-bit bundles 24 to 31
429 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 26 | SafeLoad<uint32_t>(in + 8) << 6, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) };
430 shifts = simd_batch{ 16, 0, 4, 14, 0, 2, 12, 22 };
431 results = (words >> shifts) & masks;
432 results.store_unaligned(out);
433 out += 8;
434
435 in += 10;
436 return in;
437 }
438
439 inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) {
440 uint32_t mask = 0x7ff;
441
442 simd_batch masks(mask);
443 simd_batch words, shifts;
444 simd_batch results;
445
446 // extract 11-bit bundles 0 to 7
447 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 22 | SafeLoad<uint32_t>(in + 1) << 10, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 23 | SafeLoad<uint32_t>(in + 2) << 9, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
448 shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13 };
449 results = (words >> shifts) & masks;
450 results.store_unaligned(out);
451 out += 8;
452
453 // extract 11-bit bundles 8 to 15
454 words = simd_batch{ SafeLoad<uint32_t>(in + 2) >> 24 | SafeLoad<uint32_t>(in + 3) << 8, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 25 | SafeLoad<uint32_t>(in + 4) << 7, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5) };
455 shifts = simd_batch{ 0, 3, 14, 0, 4, 15, 0, 5 };
456 results = (words >> shifts) & masks;
457 results.store_unaligned(out);
458 out += 8;
459
460 // extract 11-bit bundles 16 to 23
461 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 27 | SafeLoad<uint32_t>(in + 6) << 5, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 29 | SafeLoad<uint32_t>(in + 8) << 3 };
462 shifts = simd_batch{ 16, 0, 6, 17, 0, 7, 18, 0 };
463 results = (words >> shifts) & masks;
464 results.store_unaligned(out);
465 out += 8;
466
467 // extract 11-bit bundles 24 to 31
468 words = simd_batch{ SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 31 | SafeLoad<uint32_t>(in + 10) << 1, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) };
469 shifts = simd_batch{ 8, 19, 0, 9, 20, 0, 10, 21 };
470 results = (words >> shifts) & masks;
471 results.store_unaligned(out);
472 out += 8;
473
474 in += 11;
475 return in;
476 }
477
478 inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) {
479 uint32_t mask = 0xfff;
480
481 simd_batch masks(mask);
482 simd_batch words, shifts;
483 simd_batch results;
484
485 // extract 12-bit bundles 0 to 7
486 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 24 | SafeLoad<uint32_t>(in + 1) << 8, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
487 shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 };
488 results = (words >> shifts) & masks;
489 results.store_unaligned(out);
490 out += 8;
491
492 // extract 12-bit bundles 8 to 15
493 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
494 shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 };
495 results = (words >> shifts) & masks;
496 results.store_unaligned(out);
497 out += 8;
498
499 // extract 12-bit bundles 16 to 23
500 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) };
501 shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 };
502 results = (words >> shifts) & masks;
503 results.store_unaligned(out);
504 out += 8;
505
506 // extract 12-bit bundles 24 to 31
507 words = simd_batch{ SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 28 | SafeLoad<uint32_t>(in + 11) << 4, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) };
508 shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 };
509 results = (words >> shifts) & masks;
510 results.store_unaligned(out);
511 out += 8;
512
513 in += 12;
514 return in;
515 }
516
517 inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) {
518 uint32_t mask = 0x1fff;
519
520 simd_batch masks(mask);
521 simd_batch words, shifts;
522 simd_batch results;
523
524 // extract 13-bit bundles 0 to 7
525 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 26 | SafeLoad<uint32_t>(in + 1) << 6, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 20 | SafeLoad<uint32_t>(in + 2) << 12, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 27 | SafeLoad<uint32_t>(in + 3) << 5 };
526 shifts = simd_batch{ 0, 13, 0, 7, 0, 1, 14, 0 };
527 results = (words >> shifts) & masks;
528 results.store_unaligned(out);
529 out += 8;
530
531 // extract 13-bit bundles 8 to 15
532 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 21 | SafeLoad<uint32_t>(in + 4) << 11, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10, SafeLoad<uint32_t>(in + 6) };
533 shifts = simd_batch{ 8, 0, 2, 15, 0, 9, 0, 3 };
534 results = (words >> shifts) & masks;
535 results.store_unaligned(out);
536 out += 8;
537
538 // extract 13-bit bundles 16 to 23
539 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 29 | SafeLoad<uint32_t>(in + 7) << 3, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 23 | SafeLoad<uint32_t>(in + 8) << 9, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9) };
540 shifts = simd_batch{ 16, 0, 10, 0, 4, 17, 0, 11 };
541 results = (words >> shifts) & masks;
542 results.store_unaligned(out);
543 out += 8;
544
545 // extract 13-bit bundles 24 to 31
546 words = simd_batch{ SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 31 | SafeLoad<uint32_t>(in + 11) << 1, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 25 | SafeLoad<uint32_t>(in + 12) << 7, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) };
547 shifts = simd_batch{ 0, 5, 18, 0, 12, 0, 6, 19 };
548 results = (words >> shifts) & masks;
549 results.store_unaligned(out);
550 out += 8;
551
552 in += 13;
553 return in;
554 }
555
556 inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) {
557 uint32_t mask = 0x3fff;
558
559 simd_batch masks(mask);
560 simd_batch words, shifts;
561 simd_batch results;
562
563 // extract 14-bit bundles 0 to 7
564 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 24 | SafeLoad<uint32_t>(in + 2) << 8, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12, SafeLoad<uint32_t>(in + 3) };
565 shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2 };
566 results = (words >> shifts) & masks;
567 results.store_unaligned(out);
568 out += 8;
569
570 // extract 14-bit bundles 8 to 15
571 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
572 shifts = simd_batch{ 16, 0, 12, 0, 8, 0, 4, 18 };
573 results = (words >> shifts) & masks;
574 results.store_unaligned(out);
575 out += 8;
576
577 // extract 14-bit bundles 16 to 23
578 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12, SafeLoad<uint32_t>(in + 10) };
579 shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2 };
580 results = (words >> shifts) & masks;
581 results.store_unaligned(out);
582 out += 8;
583
584 // extract 14-bit bundles 24 to 31
585 words = simd_batch{ SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 30 | SafeLoad<uint32_t>(in + 11) << 2, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 22 | SafeLoad<uint32_t>(in + 13) << 10, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) };
586 shifts = simd_batch{ 16, 0, 12, 0, 8, 0, 4, 18 };
587 results = (words >> shifts) & masks;
588 results.store_unaligned(out);
589 out += 8;
590
591 in += 14;
592 return in;
593 }
594
595 inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) {
596 uint32_t mask = 0x7fff;
597
598 simd_batch masks(mask);
599 simd_batch words, shifts;
600 simd_batch results;
601
602 // extract 15-bit bundles 0 to 7
603 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3) };
604 shifts = simd_batch{ 0, 15, 0, 13, 0, 11, 0, 9 };
605 results = (words >> shifts) & masks;
606 results.store_unaligned(out);
607 out += 8;
608
609 // extract 15-bit bundles 8 to 15
610 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14, SafeLoad<uint32_t>(in + 7) };
611 shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1 };
612 results = (words >> shifts) & masks;
613 results.store_unaligned(out);
614 out += 8;
615
616 // extract 15-bit bundles 16 to 23
617 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 31 | SafeLoad<uint32_t>(in + 8) << 1, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 29 | SafeLoad<uint32_t>(in + 9) << 3, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 27 | SafeLoad<uint32_t>(in + 10) << 5, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 25 | SafeLoad<uint32_t>(in + 11) << 7 };
618 shifts = simd_batch{ 16, 0, 14, 0, 12, 0, 10, 0 };
619 results = (words >> shifts) & masks;
620 results.store_unaligned(out);
621 out += 8;
622
623 // extract 15-bit bundles 24 to 31
624 words = simd_batch{ SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 23 | SafeLoad<uint32_t>(in + 12) << 9, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 21 | SafeLoad<uint32_t>(in + 13) << 11, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 19 | SafeLoad<uint32_t>(in + 14) << 13, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) };
625 shifts = simd_batch{ 8, 0, 6, 0, 4, 0, 2, 17 };
626 results = (words >> shifts) & masks;
627 results.store_unaligned(out);
628 out += 8;
629
630 in += 15;
631 return in;
632 }
633
634 inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) {
635 uint32_t mask = 0xffff;
636
637 simd_batch masks(mask);
638 simd_batch words, shifts;
639 simd_batch results;
640
641 // extract 16-bit bundles 0 to 7
642 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
643 shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 };
644 results = (words >> shifts) & masks;
645 results.store_unaligned(out);
646 out += 8;
647
648 // extract 16-bit bundles 8 to 15
649 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) };
650 shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 };
651 results = (words >> shifts) & masks;
652 results.store_unaligned(out);
653 out += 8;
654
655 // extract 16-bit bundles 16 to 23
656 words = simd_batch{ SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) };
657 shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 };
658 results = (words >> shifts) & masks;
659 results.store_unaligned(out);
660 out += 8;
661
662 // extract 16-bit bundles 24 to 31
663 words = simd_batch{ SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) };
664 shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 };
665 results = (words >> shifts) & masks;
666 results.store_unaligned(out);
667 out += 8;
668
669 in += 16;
670 return in;
671 }
672
673 inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) {
674 uint32_t mask = 0x1ffff;
675
676 simd_batch masks(mask);
677 simd_batch words, shifts;
678 simd_batch results;
679
680 // extract 17-bit bundles 0 to 7
681 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 17 | SafeLoad<uint32_t>(in + 1) << 15, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 19 | SafeLoad<uint32_t>(in + 2) << 13, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 21 | SafeLoad<uint32_t>(in + 3) << 11, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 23 | SafeLoad<uint32_t>(in + 4) << 9 };
682 shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0 };
683 results = (words >> shifts) & masks;
684 results.store_unaligned(out);
685 out += 8;
686
687 // extract 17-bit bundles 8 to 15
688 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 25 | SafeLoad<uint32_t>(in + 5) << 7, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 27 | SafeLoad<uint32_t>(in + 6) << 5, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 29 | SafeLoad<uint32_t>(in + 7) << 3, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 31 | SafeLoad<uint32_t>(in + 8) << 1 };
689 shifts = simd_batch{ 8, 0, 10, 0, 12, 0, 14, 0 };
690 results = (words >> shifts) & masks;
691 results.store_unaligned(out);
692 out += 8;
693
694 // extract 17-bit bundles 16 to 23
695 words = simd_batch{ SafeLoad<uint32_t>(in + 8) >> 16 | SafeLoad<uint32_t>(in + 9) << 16, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 18 | SafeLoad<uint32_t>(in + 10) << 14, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 20 | SafeLoad<uint32_t>(in + 11) << 12, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 22 | SafeLoad<uint32_t>(in + 12) << 10, SafeLoad<uint32_t>(in + 12) };
696 shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7 };
697 results = (words >> shifts) & masks;
698 results.store_unaligned(out);
699 out += 8;
700
701 // extract 17-bit bundles 24 to 31
702 words = simd_batch{ SafeLoad<uint32_t>(in + 12) >> 24 | SafeLoad<uint32_t>(in + 13) << 8, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 26 | SafeLoad<uint32_t>(in + 14) << 6, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 28 | SafeLoad<uint32_t>(in + 15) << 4, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 30 | SafeLoad<uint32_t>(in + 16) << 2, SafeLoad<uint32_t>(in + 16) };
703 shifts = simd_batch{ 0, 9, 0, 11, 0, 13, 0, 15 };
704 results = (words >> shifts) & masks;
705 results.store_unaligned(out);
706 out += 8;
707
708 in += 17;
709 return in;
710 }
711
712 inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) {
713 uint32_t mask = 0x3ffff;
714
715 simd_batch masks(mask);
716 simd_batch words, shifts;
717 simd_batch results;
718
719 // extract 18-bit bundles 0 to 7
720 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 18 | SafeLoad<uint32_t>(in + 1) << 14, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 22 | SafeLoad<uint32_t>(in + 2) << 10, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2 };
721 shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0 };
722 results = (words >> shifts) & masks;
723 results.store_unaligned(out);
724 out += 8;
725
726 // extract 18-bit bundles 8 to 15
727 words = simd_batch{ SafeLoad<uint32_t>(in + 4) >> 16 | SafeLoad<uint32_t>(in + 5) << 16, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) };
728 shifts = simd_batch{ 0, 2, 0, 6, 0, 10, 0, 14 };
729 results = (words >> shifts) & masks;
730 results.store_unaligned(out);
731 out += 8;
732
733 // extract 18-bit bundles 16 to 23
734 words = simd_batch{ SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 18 | SafeLoad<uint32_t>(in + 10) << 14, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 22 | SafeLoad<uint32_t>(in + 11) << 10, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 30 | SafeLoad<uint32_t>(in + 13) << 2 };
735 shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0 };
736 results = (words >> shifts) & masks;
737 results.store_unaligned(out);
738 out += 8;
739
740 // extract 18-bit bundles 24 to 31
741 words = simd_batch{ SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 20 | SafeLoad<uint32_t>(in + 15) << 12, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) };
742 shifts = simd_batch{ 0, 2, 0, 6, 0, 10, 0, 14 };
743 results = (words >> shifts) & masks;
744 results.store_unaligned(out);
745 out += 8;
746
747 in += 18;
748 return in;
749 }
750
751 inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) {
752 uint32_t mask = 0x7ffff;
753
754 simd_batch masks(mask);
755 simd_batch words, shifts;
756 simd_batch results;
757
758 // extract 19-bit bundles 0 to 7
759 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 19 | SafeLoad<uint32_t>(in + 1) << 13, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 25 | SafeLoad<uint32_t>(in + 2) << 7, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 31 | SafeLoad<uint32_t>(in + 3) << 1, SafeLoad<uint32_t>(in + 3) >> 18 | SafeLoad<uint32_t>(in + 4) << 14, SafeLoad<uint32_t>(in + 4) };
760 shifts = simd_batch{ 0, 0, 6, 0, 12, 0, 0, 5 };
761 results = (words >> shifts) & masks;
762 results.store_unaligned(out);
763 out += 8;
764
765 // extract 19-bit bundles 8 to 15
766 words = simd_batch{ SafeLoad<uint32_t>(in + 4) >> 24 | SafeLoad<uint32_t>(in + 5) << 8, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 30 | SafeLoad<uint32_t>(in + 6) << 2, SafeLoad<uint32_t>(in + 6) >> 17 | SafeLoad<uint32_t>(in + 7) << 15, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 23 | SafeLoad<uint32_t>(in + 8) << 9, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 29 | SafeLoad<uint32_t>(in + 9) << 3 };
767 shifts = simd_batch{ 0, 11, 0, 0, 4, 0, 10, 0 };
768 results = (words >> shifts) & masks;
769 results.store_unaligned(out);
770 out += 8;
771
772 // extract 19-bit bundles 16 to 23
773 words = simd_batch{ SafeLoad<uint32_t>(in + 9) >> 16 | SafeLoad<uint32_t>(in + 10) << 16, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 22 | SafeLoad<uint32_t>(in + 11) << 10, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 28 | SafeLoad<uint32_t>(in + 12) << 4, SafeLoad<uint32_t>(in + 12) >> 15 | SafeLoad<uint32_t>(in + 13) << 17, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 21 | SafeLoad<uint32_t>(in + 14) << 11 };
774 shifts = simd_batch{ 0, 3, 0, 9, 0, 0, 2, 0 };
775 results = (words >> shifts) & masks;
776 results.store_unaligned(out);
777 out += 8;
778
779 // extract 19-bit bundles 24 to 31
780 words = simd_batch{ SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 27 | SafeLoad<uint32_t>(in + 15) << 5, SafeLoad<uint32_t>(in + 15) >> 14 | SafeLoad<uint32_t>(in + 16) << 18, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 20 | SafeLoad<uint32_t>(in + 17) << 12, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 26 | SafeLoad<uint32_t>(in + 18) << 6, SafeLoad<uint32_t>(in + 18) };
781 shifts = simd_batch{ 8, 0, 0, 1, 0, 7, 0, 13 };
782 results = (words >> shifts) & masks;
783 results.store_unaligned(out);
784 out += 8;
785
786 in += 19;
787 return in;
788 }
789
790 inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) {
791 uint32_t mask = 0xfffff;
792
793 simd_batch masks(mask);
794 simd_batch words, shifts;
795 simd_batch results;
796
797 // extract 20-bit bundles 0 to 7
798 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 20 | SafeLoad<uint32_t>(in + 1) << 12, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2) >> 16 | SafeLoad<uint32_t>(in + 3) << 16, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) };
799 shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 };
800 results = (words >> shifts) & masks;
801 results.store_unaligned(out);
802 out += 8;
803
804 // extract 20-bit bundles 8 to 15
805 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9) };
806 shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 };
807 results = (words >> shifts) & masks;
808 results.store_unaligned(out);
809 out += 8;
810
811 // extract 20-bit bundles 16 to 23
812 words = simd_batch{ SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 20 | SafeLoad<uint32_t>(in + 11) << 12, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 28 | SafeLoad<uint32_t>(in + 12) << 4, SafeLoad<uint32_t>(in + 12) >> 16 | SafeLoad<uint32_t>(in + 13) << 16, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 24 | SafeLoad<uint32_t>(in + 14) << 8, SafeLoad<uint32_t>(in + 14) };
813 shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 };
814 results = (words >> shifts) & masks;
815 results.store_unaligned(out);
816 out += 8;
817
818 // extract 20-bit bundles 24 to 31
819 words = simd_batch{ SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 20 | SafeLoad<uint32_t>(in + 16) << 12, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) >> 16 | SafeLoad<uint32_t>(in + 18) << 16, SafeLoad<uint32_t>(in + 18), SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) };
820 shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 };
821 results = (words >> shifts) & masks;
822 results.store_unaligned(out);
823 out += 8;
824
825 in += 20;
826 return in;
827 }
828
829 inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) {
830 uint32_t mask = 0x1fffff;
831
832 simd_batch masks(mask);
833 simd_batch words, shifts;
834 simd_batch results;
835
836 // extract 21-bit bundles 0 to 7
837 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 21 | SafeLoad<uint32_t>(in + 1) << 11, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4) >> 19 | SafeLoad<uint32_t>(in + 5) << 13 };
838 shifts = simd_batch{ 0, 0, 10, 0, 0, 9, 0, 0 };
839 results = (words >> shifts) & masks;
840 results.store_unaligned(out);
841 out += 8;
842
843 // extract 21-bit bundles 8 to 15
844 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) >> 17 | SafeLoad<uint32_t>(in + 9) << 15, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 27 | SafeLoad<uint32_t>(in + 10) << 5 };
845 shifts = simd_batch{ 8, 0, 0, 7, 0, 0, 6, 0 };
846 results = (words >> shifts) & masks;
847 results.store_unaligned(out);
848 out += 8;
849
850 // extract 21-bit bundles 16 to 23
851 words = simd_batch{ SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12) >> 15 | SafeLoad<uint32_t>(in + 13) << 17, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 25 | SafeLoad<uint32_t>(in + 14) << 7, SafeLoad<uint32_t>(in + 14) >> 14 | SafeLoad<uint32_t>(in + 15) << 18, SafeLoad<uint32_t>(in + 15) };
852 shifts = simd_batch{ 0, 5, 0, 0, 4, 0, 0, 3 };
853 results = (words >> shifts) & masks;
854 results.store_unaligned(out);
855 out += 8;
856
857 // extract 21-bit bundles 24 to 31
858 words = simd_batch{ SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 13 | SafeLoad<uint32_t>(in + 17) << 19, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 23 | SafeLoad<uint32_t>(in + 18) << 9, SafeLoad<uint32_t>(in + 18) >> 12 | SafeLoad<uint32_t>(in + 19) << 20, SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 22 | SafeLoad<uint32_t>(in + 20) << 10, SafeLoad<uint32_t>(in + 20) };
859 shifts = simd_batch{ 0, 0, 2, 0, 0, 1, 0, 11 };
860 results = (words >> shifts) & masks;
861 results.store_unaligned(out);
862 out += 8;
863
864 in += 21;
865 return in;
866 }
867
868 inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) {
869 uint32_t mask = 0x3fffff;
870
871 simd_batch masks(mask);
872 simd_batch words, shifts;
873 simd_batch results;
874
875 // extract 22-bit bundles 0 to 7
876 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 22 | SafeLoad<uint32_t>(in + 1) << 10, SafeLoad<uint32_t>(in + 1) >> 12 | SafeLoad<uint32_t>(in + 2) << 20, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 24 | SafeLoad<uint32_t>(in + 3) << 8, SafeLoad<uint32_t>(in + 3) >> 14 | SafeLoad<uint32_t>(in + 4) << 18, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6 };
877 shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0 };
878 results = (words >> shifts) & masks;
879 results.store_unaligned(out);
880 out += 8;
881
882 // extract 22-bit bundles 8 to 15
883 words = simd_batch{ SafeLoad<uint32_t>(in + 5) >> 16 | SafeLoad<uint32_t>(in + 6) << 16, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7) >> 18 | SafeLoad<uint32_t>(in + 8) << 14, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12, SafeLoad<uint32_t>(in + 10) };
884 shifts = simd_batch{ 0, 6, 0, 0, 8, 0, 0, 10 };
885 results = (words >> shifts) & masks;
886 results.store_unaligned(out);
887 out += 8;
888
889 // extract 22-bit bundles 16 to 23
890 words = simd_batch{ SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 22 | SafeLoad<uint32_t>(in + 12) << 10, SafeLoad<uint32_t>(in + 12) >> 12 | SafeLoad<uint32_t>(in + 13) << 20, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 24 | SafeLoad<uint32_t>(in + 14) << 8, SafeLoad<uint32_t>(in + 14) >> 14 | SafeLoad<uint32_t>(in + 15) << 18, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 26 | SafeLoad<uint32_t>(in + 16) << 6 };
891 shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0 };
892 results = (words >> shifts) & masks;
893 results.store_unaligned(out);
894 out += 8;
895
896 // extract 22-bit bundles 24 to 31
897 words = simd_batch{ SafeLoad<uint32_t>(in + 16) >> 16 | SafeLoad<uint32_t>(in + 17) << 16, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 28 | SafeLoad<uint32_t>(in + 18) << 4, SafeLoad<uint32_t>(in + 18) >> 18 | SafeLoad<uint32_t>(in + 19) << 14, SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 30 | SafeLoad<uint32_t>(in + 20) << 2, SafeLoad<uint32_t>(in + 20) >> 20 | SafeLoad<uint32_t>(in + 21) << 12, SafeLoad<uint32_t>(in + 21) };
898 shifts = simd_batch{ 0, 6, 0, 0, 8, 0, 0, 10 };
899 results = (words >> shifts) & masks;
900 results.store_unaligned(out);
901 out += 8;
902
903 in += 22;
904 return in;
905 }
906
907 inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) {
908 uint32_t mask = 0x7fffff;
909
910 simd_batch masks(mask);
911 simd_batch words, shifts;
912 simd_batch results;
913
914 // extract 23-bit bundles 0 to 7
915 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 23 | SafeLoad<uint32_t>(in + 1) << 9, SafeLoad<uint32_t>(in + 1) >> 14 | SafeLoad<uint32_t>(in + 2) << 18, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 28 | SafeLoad<uint32_t>(in + 3) << 4, SafeLoad<uint32_t>(in + 3) >> 19 | SafeLoad<uint32_t>(in + 4) << 13, SafeLoad<uint32_t>(in + 4) >> 10 | SafeLoad<uint32_t>(in + 5) << 22, SafeLoad<uint32_t>(in + 5) };
916 shifts = simd_batch{ 0, 0, 0, 5, 0, 0, 0, 1 };
917 results = (words >> shifts) & masks;
918 results.store_unaligned(out);
919 out += 8;
920
921 // extract 23-bit bundles 8 to 15
922 words = simd_batch{ SafeLoad<uint32_t>(in + 5) >> 24 | SafeLoad<uint32_t>(in + 6) << 8, SafeLoad<uint32_t>(in + 6) >> 15 | SafeLoad<uint32_t>(in + 7) << 17, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 29 | SafeLoad<uint32_t>(in + 8) << 3, SafeLoad<uint32_t>(in + 8) >> 20 | SafeLoad<uint32_t>(in + 9) << 12, SafeLoad<uint32_t>(in + 9) >> 11 | SafeLoad<uint32_t>(in + 10) << 21, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 25 | SafeLoad<uint32_t>(in + 11) << 7 };
923 shifts = simd_batch{ 0, 0, 6, 0, 0, 0, 2, 0 };
924 results = (words >> shifts) & masks;
925 results.store_unaligned(out);
926 out += 8;
927
928 // extract 23-bit bundles 16 to 23
929 words = simd_batch{ SafeLoad<uint32_t>(in + 11) >> 16 | SafeLoad<uint32_t>(in + 12) << 16, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 30 | SafeLoad<uint32_t>(in + 13) << 2, SafeLoad<uint32_t>(in + 13) >> 21 | SafeLoad<uint32_t>(in + 14) << 11, SafeLoad<uint32_t>(in + 14) >> 12 | SafeLoad<uint32_t>(in + 15) << 20, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 26 | SafeLoad<uint32_t>(in + 16) << 6, SafeLoad<uint32_t>(in + 16) >> 17 | SafeLoad<uint32_t>(in + 17) << 15 };
930 shifts = simd_batch{ 0, 7, 0, 0, 0, 3, 0, 0 };
931 results = (words >> shifts) & masks;
932 results.store_unaligned(out);
933 out += 8;
934
935 // extract 23-bit bundles 24 to 31
936 words = simd_batch{ SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 31 | SafeLoad<uint32_t>(in + 18) << 1, SafeLoad<uint32_t>(in + 18) >> 22 | SafeLoad<uint32_t>(in + 19) << 10, SafeLoad<uint32_t>(in + 19) >> 13 | SafeLoad<uint32_t>(in + 20) << 19, SafeLoad<uint32_t>(in + 20), SafeLoad<uint32_t>(in + 20) >> 27 | SafeLoad<uint32_t>(in + 21) << 5, SafeLoad<uint32_t>(in + 21) >> 18 | SafeLoad<uint32_t>(in + 22) << 14, SafeLoad<uint32_t>(in + 22) };
937 shifts = simd_batch{ 8, 0, 0, 0, 4, 0, 0, 9 };
938 results = (words >> shifts) & masks;
939 results.store_unaligned(out);
940 out += 8;
941
942 in += 23;
943 return in;
944 }
945
946 inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) {
947 uint32_t mask = 0xffffff;
948
949 simd_batch masks(mask);
950 simd_batch words, shifts;
951 simd_batch results;
952
953 // extract 24-bit bundles 0 to 7
954 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 24 | SafeLoad<uint32_t>(in + 1) << 8, SafeLoad<uint32_t>(in + 1) >> 16 | SafeLoad<uint32_t>(in + 2) << 16, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) >> 16 | SafeLoad<uint32_t>(in + 5) << 16, SafeLoad<uint32_t>(in + 5) };
955 shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 };
956 results = (words >> shifts) & masks;
957 results.store_unaligned(out);
958 out += 8;
959
960 // extract 24-bit bundles 8 to 15
961 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11) };
962 shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 };
963 results = (words >> shifts) & masks;
964 results.store_unaligned(out);
965 out += 8;
966
967 // extract 24-bit bundles 16 to 23
968 words = simd_batch{ SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 24 | SafeLoad<uint32_t>(in + 13) << 8, SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 16 | SafeLoad<uint32_t>(in + 17) << 16, SafeLoad<uint32_t>(in + 17) };
969 shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 };
970 results = (words >> shifts) & masks;
971 results.store_unaligned(out);
972 out += 8;
973
974 // extract 24-bit bundles 24 to 31
975 words = simd_batch{ SafeLoad<uint32_t>(in + 18), SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 16 | SafeLoad<uint32_t>(in + 20) << 16, SafeLoad<uint32_t>(in + 20), SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 24 | SafeLoad<uint32_t>(in + 22) << 8, SafeLoad<uint32_t>(in + 22) >> 16 | SafeLoad<uint32_t>(in + 23) << 16, SafeLoad<uint32_t>(in + 23) };
976 shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 };
977 results = (words >> shifts) & masks;
978 results.store_unaligned(out);
979 out += 8;
980
981 in += 24;
982 return in;
983 }
984
985 inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) {
986 uint32_t mask = 0x1ffffff;
987
988 simd_batch masks(mask);
989 simd_batch words, shifts;
990 simd_batch results;
991
992 // extract 25-bit bundles 0 to 7
993 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 25 | SafeLoad<uint32_t>(in + 1) << 7, SafeLoad<uint32_t>(in + 1) >> 18 | SafeLoad<uint32_t>(in + 2) << 14, SafeLoad<uint32_t>(in + 2) >> 11 | SafeLoad<uint32_t>(in + 3) << 21, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 29 | SafeLoad<uint32_t>(in + 4) << 3, SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5) >> 15 | SafeLoad<uint32_t>(in + 6) << 17 };
994 shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0 };
995 results = (words >> shifts) & masks;
996 results.store_unaligned(out);
997 out += 8;
998
999 // extract 25-bit bundles 8 to 15
1000 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 8 | SafeLoad<uint32_t>(in + 7) << 24, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 26 | SafeLoad<uint32_t>(in + 8) << 6, SafeLoad<uint32_t>(in + 8) >> 19 | SafeLoad<uint32_t>(in + 9) << 13, SafeLoad<uint32_t>(in + 9) >> 12 | SafeLoad<uint32_t>(in + 10) << 20, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 30 | SafeLoad<uint32_t>(in + 11) << 2, SafeLoad<uint32_t>(in + 11) >> 23 | SafeLoad<uint32_t>(in + 12) << 9 };
1001 shifts = simd_batch{ 0, 1, 0, 0, 0, 5, 0, 0 };
1002 results = (words >> shifts) & masks;
1003 results.store_unaligned(out);
1004 out += 8;
1005
1006 // extract 25-bit bundles 16 to 23
1007 words = simd_batch{ SafeLoad<uint32_t>(in + 12) >> 16 | SafeLoad<uint32_t>(in + 13) << 16, SafeLoad<uint32_t>(in + 13) >> 9 | SafeLoad<uint32_t>(in + 14) << 23, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 27 | SafeLoad<uint32_t>(in + 15) << 5, SafeLoad<uint32_t>(in + 15) >> 20 | SafeLoad<uint32_t>(in + 16) << 12, SafeLoad<uint32_t>(in + 16) >> 13 | SafeLoad<uint32_t>(in + 17) << 19, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 31 | SafeLoad<uint32_t>(in + 18) << 1 };
1008 shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 6, 0 };
1009 results = (words >> shifts) & masks;
1010 results.store_unaligned(out);
1011 out += 8;
1012
1013 // extract 25-bit bundles 24 to 31
1014 words = simd_batch{ SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 17 | SafeLoad<uint32_t>(in + 20) << 15, SafeLoad<uint32_t>(in + 20) >> 10 | SafeLoad<uint32_t>(in + 21) << 22, SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 28 | SafeLoad<uint32_t>(in + 22) << 4, SafeLoad<uint32_t>(in + 22) >> 21 | SafeLoad<uint32_t>(in + 23) << 11, SafeLoad<uint32_t>(in + 23) >> 14 | SafeLoad<uint32_t>(in + 24) << 18, SafeLoad<uint32_t>(in + 24) };
1015 shifts = simd_batch{ 0, 0, 0, 3, 0, 0, 0, 7 };
1016 results = (words >> shifts) & masks;
1017 results.store_unaligned(out);
1018 out += 8;
1019
1020 in += 25;
1021 return in;
1022 }
1023
1024 inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) {
1025 uint32_t mask = 0x3ffffff;
1026
1027 simd_batch masks(mask);
1028 simd_batch words, shifts;
1029 simd_batch results;
1030
1031 // extract 26-bit bundles 0 to 7
1032 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 26 | SafeLoad<uint32_t>(in + 1) << 6, SafeLoad<uint32_t>(in + 1) >> 20 | SafeLoad<uint32_t>(in + 2) << 12, SafeLoad<uint32_t>(in + 2) >> 14 | SafeLoad<uint32_t>(in + 3) << 18, SafeLoad<uint32_t>(in + 3) >> 8 | SafeLoad<uint32_t>(in + 4) << 24, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10 };
1033 shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0 };
1034 results = (words >> shifts) & masks;
1035 results.store_unaligned(out);
1036 out += 8;
1037
1038 // extract 26-bit bundles 8 to 15
1039 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 16 | SafeLoad<uint32_t>(in + 7) << 16, SafeLoad<uint32_t>(in + 7) >> 10 | SafeLoad<uint32_t>(in + 8) << 22, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10) >> 18 | SafeLoad<uint32_t>(in + 11) << 14, SafeLoad<uint32_t>(in + 11) >> 12 | SafeLoad<uint32_t>(in + 12) << 20, SafeLoad<uint32_t>(in + 12) };
1040 shifts = simd_batch{ 0, 0, 4, 0, 0, 0, 0, 6 };
1041 results = (words >> shifts) & masks;
1042 results.store_unaligned(out);
1043 out += 8;
1044
1045 // extract 26-bit bundles 16 to 23
1046 words = simd_batch{ SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 26 | SafeLoad<uint32_t>(in + 14) << 6, SafeLoad<uint32_t>(in + 14) >> 20 | SafeLoad<uint32_t>(in + 15) << 12, SafeLoad<uint32_t>(in + 15) >> 14 | SafeLoad<uint32_t>(in + 16) << 18, SafeLoad<uint32_t>(in + 16) >> 8 | SafeLoad<uint32_t>(in + 17) << 24, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 28 | SafeLoad<uint32_t>(in + 18) << 4, SafeLoad<uint32_t>(in + 18) >> 22 | SafeLoad<uint32_t>(in + 19) << 10 };
1047 shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0 };
1048 results = (words >> shifts) & masks;
1049 results.store_unaligned(out);
1050 out += 8;
1051
1052 // extract 26-bit bundles 24 to 31
1053 words = simd_batch{ SafeLoad<uint32_t>(in + 19) >> 16 | SafeLoad<uint32_t>(in + 20) << 16, SafeLoad<uint32_t>(in + 20) >> 10 | SafeLoad<uint32_t>(in + 21) << 22, SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 30 | SafeLoad<uint32_t>(in + 22) << 2, SafeLoad<uint32_t>(in + 22) >> 24 | SafeLoad<uint32_t>(in + 23) << 8, SafeLoad<uint32_t>(in + 23) >> 18 | SafeLoad<uint32_t>(in + 24) << 14, SafeLoad<uint32_t>(in + 24) >> 12 | SafeLoad<uint32_t>(in + 25) << 20, SafeLoad<uint32_t>(in + 25) };
1054 shifts = simd_batch{ 0, 0, 4, 0, 0, 0, 0, 6 };
1055 results = (words >> shifts) & masks;
1056 results.store_unaligned(out);
1057 out += 8;
1058
1059 in += 26;
1060 return in;
1061 }
1062
1063 inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) {
1064 uint32_t mask = 0x7ffffff;
1065
1066 simd_batch masks(mask);
1067 simd_batch words, shifts;
1068 simd_batch results;
1069
1070 // extract 27-bit bundles 0 to 7
1071 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 27 | SafeLoad<uint32_t>(in + 1) << 5, SafeLoad<uint32_t>(in + 1) >> 22 | SafeLoad<uint32_t>(in + 2) << 10, SafeLoad<uint32_t>(in + 2) >> 17 | SafeLoad<uint32_t>(in + 3) << 15, SafeLoad<uint32_t>(in + 3) >> 12 | SafeLoad<uint32_t>(in + 4) << 20, SafeLoad<uint32_t>(in + 4) >> 7 | SafeLoad<uint32_t>(in + 5) << 25, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3 };
1072 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 2, 0 };
1073 results = (words >> shifts) & masks;
1074 results.store_unaligned(out);
1075 out += 8;
1076
1077 // extract 27-bit bundles 8 to 15
1078 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7) >> 19 | SafeLoad<uint32_t>(in + 8) << 13, SafeLoad<uint32_t>(in + 8) >> 14 | SafeLoad<uint32_t>(in + 9) << 18, SafeLoad<uint32_t>(in + 9) >> 9 | SafeLoad<uint32_t>(in + 10) << 23, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 31 | SafeLoad<uint32_t>(in + 11) << 1, SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12) >> 21 | SafeLoad<uint32_t>(in + 13) << 11 };
1079 shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0 };
1080 results = (words >> shifts) & masks;
1081 results.store_unaligned(out);
1082 out += 8;
1083
1084 // extract 27-bit bundles 16 to 23
1085 words = simd_batch{ SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14) >> 11 | SafeLoad<uint32_t>(in + 15) << 21, SafeLoad<uint32_t>(in + 15) >> 6 | SafeLoad<uint32_t>(in + 16) << 26, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) >> 23 | SafeLoad<uint32_t>(in + 18) << 9, SafeLoad<uint32_t>(in + 18) >> 18 | SafeLoad<uint32_t>(in + 19) << 14, SafeLoad<uint32_t>(in + 19) >> 13 | SafeLoad<uint32_t>(in + 20) << 19 };
1086 shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 0, 0 };
1087 results = (words >> shifts) & masks;
1088 results.store_unaligned(out);
1089 out += 8;
1090
1091 // extract 27-bit bundles 24 to 31
1092 words = simd_batch{ SafeLoad<uint32_t>(in + 20) >> 8 | SafeLoad<uint32_t>(in + 21) << 24, SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 30 | SafeLoad<uint32_t>(in + 22) << 2, SafeLoad<uint32_t>(in + 22) >> 25 | SafeLoad<uint32_t>(in + 23) << 7, SafeLoad<uint32_t>(in + 23) >> 20 | SafeLoad<uint32_t>(in + 24) << 12, SafeLoad<uint32_t>(in + 24) >> 15 | SafeLoad<uint32_t>(in + 25) << 17, SafeLoad<uint32_t>(in + 25) >> 10 | SafeLoad<uint32_t>(in + 26) << 22, SafeLoad<uint32_t>(in + 26) };
1093 shifts = simd_batch{ 0, 3, 0, 0, 0, 0, 0, 5 };
1094 results = (words >> shifts) & masks;
1095 results.store_unaligned(out);
1096 out += 8;
1097
1098 in += 27;
1099 return in;
1100 }
1101
1102 inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) {
1103 uint32_t mask = 0xfffffff;
1104
1105 simd_batch masks(mask);
1106 simd_batch words, shifts;
1107 simd_batch results;
1108
1109 // extract 28-bit bundles 0 to 7
1110 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1) >> 24 | SafeLoad<uint32_t>(in + 2) << 8, SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12, SafeLoad<uint32_t>(in + 3) >> 16 | SafeLoad<uint32_t>(in + 4) << 16, SafeLoad<uint32_t>(in + 4) >> 12 | SafeLoad<uint32_t>(in + 5) << 20, SafeLoad<uint32_t>(in + 5) >> 8 | SafeLoad<uint32_t>(in + 6) << 24, SafeLoad<uint32_t>(in + 6) };
1111 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 };
1112 results = (words >> shifts) & masks;
1113 results.store_unaligned(out);
1114 out += 8;
1115
1116 // extract 28-bit bundles 8 to 15
1117 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12, SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11) >> 12 | SafeLoad<uint32_t>(in + 12) << 20, SafeLoad<uint32_t>(in + 12) >> 8 | SafeLoad<uint32_t>(in + 13) << 24, SafeLoad<uint32_t>(in + 13) };
1118 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 };
1119 results = (words >> shifts) & masks;
1120 results.store_unaligned(out);
1121 out += 8;
1122
1123 // extract 28-bit bundles 16 to 23
1124 words = simd_batch{ SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 28 | SafeLoad<uint32_t>(in + 15) << 4, SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 20 | SafeLoad<uint32_t>(in + 17) << 12, SafeLoad<uint32_t>(in + 17) >> 16 | SafeLoad<uint32_t>(in + 18) << 16, SafeLoad<uint32_t>(in + 18) >> 12 | SafeLoad<uint32_t>(in + 19) << 20, SafeLoad<uint32_t>(in + 19) >> 8 | SafeLoad<uint32_t>(in + 20) << 24, SafeLoad<uint32_t>(in + 20) };
1125 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 };
1126 results = (words >> shifts) & masks;
1127 results.store_unaligned(out);
1128 out += 8;
1129
1130 // extract 28-bit bundles 24 to 31
1131 words = simd_batch{ SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 28 | SafeLoad<uint32_t>(in + 22) << 4, SafeLoad<uint32_t>(in + 22) >> 24 | SafeLoad<uint32_t>(in + 23) << 8, SafeLoad<uint32_t>(in + 23) >> 20 | SafeLoad<uint32_t>(in + 24) << 12, SafeLoad<uint32_t>(in + 24) >> 16 | SafeLoad<uint32_t>(in + 25) << 16, SafeLoad<uint32_t>(in + 25) >> 12 | SafeLoad<uint32_t>(in + 26) << 20, SafeLoad<uint32_t>(in + 26) >> 8 | SafeLoad<uint32_t>(in + 27) << 24, SafeLoad<uint32_t>(in + 27) };
1132 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 };
1133 results = (words >> shifts) & masks;
1134 results.store_unaligned(out);
1135 out += 8;
1136
1137 in += 28;
1138 return in;
1139 }
1140
1141 inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) {
1142 uint32_t mask = 0x1fffffff;
1143
1144 simd_batch masks(mask);
1145 simd_batch words, shifts;
1146 simd_batch results;
1147
1148 // extract 29-bit bundles 0 to 7
1149 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 29 | SafeLoad<uint32_t>(in + 1) << 3, SafeLoad<uint32_t>(in + 1) >> 26 | SafeLoad<uint32_t>(in + 2) << 6, SafeLoad<uint32_t>(in + 2) >> 23 | SafeLoad<uint32_t>(in + 3) << 9, SafeLoad<uint32_t>(in + 3) >> 20 | SafeLoad<uint32_t>(in + 4) << 12, SafeLoad<uint32_t>(in + 4) >> 17 | SafeLoad<uint32_t>(in + 5) << 15, SafeLoad<uint32_t>(in + 5) >> 14 | SafeLoad<uint32_t>(in + 6) << 18, SafeLoad<uint32_t>(in + 6) >> 11 | SafeLoad<uint32_t>(in + 7) << 21 };
1150 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 };
1151 results = (words >> shifts) & masks;
1152 results.store_unaligned(out);
1153 out += 8;
1154
1155 // extract 29-bit bundles 8 to 15
1156 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 8 | SafeLoad<uint32_t>(in + 8) << 24, SafeLoad<uint32_t>(in + 8) >> 5 | SafeLoad<uint32_t>(in + 9) << 27, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 31 | SafeLoad<uint32_t>(in + 10) << 1, SafeLoad<uint32_t>(in + 10) >> 28 | SafeLoad<uint32_t>(in + 11) << 4, SafeLoad<uint32_t>(in + 11) >> 25 | SafeLoad<uint32_t>(in + 12) << 7, SafeLoad<uint32_t>(in + 12) >> 22 | SafeLoad<uint32_t>(in + 13) << 10, SafeLoad<uint32_t>(in + 13) >> 19 | SafeLoad<uint32_t>(in + 14) << 13 };
1157 shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 0, 0 };
1158 results = (words >> shifts) & masks;
1159 results.store_unaligned(out);
1160 out += 8;
1161
1162 // extract 29-bit bundles 16 to 23
1163 words = simd_batch{ SafeLoad<uint32_t>(in + 14) >> 16 | SafeLoad<uint32_t>(in + 15) << 16, SafeLoad<uint32_t>(in + 15) >> 13 | SafeLoad<uint32_t>(in + 16) << 19, SafeLoad<uint32_t>(in + 16) >> 10 | SafeLoad<uint32_t>(in + 17) << 22, SafeLoad<uint32_t>(in + 17) >> 7 | SafeLoad<uint32_t>(in + 18) << 25, SafeLoad<uint32_t>(in + 18) >> 4 | SafeLoad<uint32_t>(in + 19) << 28, SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 30 | SafeLoad<uint32_t>(in + 20) << 2, SafeLoad<uint32_t>(in + 20) >> 27 | SafeLoad<uint32_t>(in + 21) << 5 };
1164 shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0 };
1165 results = (words >> shifts) & masks;
1166 results.store_unaligned(out);
1167 out += 8;
1168
1169 // extract 29-bit bundles 24 to 31
1170 words = simd_batch{ SafeLoad<uint32_t>(in + 21) >> 24 | SafeLoad<uint32_t>(in + 22) << 8, SafeLoad<uint32_t>(in + 22) >> 21 | SafeLoad<uint32_t>(in + 23) << 11, SafeLoad<uint32_t>(in + 23) >> 18 | SafeLoad<uint32_t>(in + 24) << 14, SafeLoad<uint32_t>(in + 24) >> 15 | SafeLoad<uint32_t>(in + 25) << 17, SafeLoad<uint32_t>(in + 25) >> 12 | SafeLoad<uint32_t>(in + 26) << 20, SafeLoad<uint32_t>(in + 26) >> 9 | SafeLoad<uint32_t>(in + 27) << 23, SafeLoad<uint32_t>(in + 27) >> 6 | SafeLoad<uint32_t>(in + 28) << 26, SafeLoad<uint32_t>(in + 28) };
1171 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 3 };
1172 results = (words >> shifts) & masks;
1173 results.store_unaligned(out);
1174 out += 8;
1175
1176 in += 29;
1177 return in;
1178 }
1179
1180 inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) {
1181 uint32_t mask = 0x3fffffff;
1182
1183 simd_batch masks(mask);
1184 simd_batch words, shifts;
1185 simd_batch results;
1186
1187 // extract 30-bit bundles 0 to 7
1188 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14 };
1189 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 };
1190 results = (words >> shifts) & masks;
1191 results.store_unaligned(out);
1192 out += 8;
1193
1194 // extract 30-bit bundles 8 to 15
1195 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8) >> 14 | SafeLoad<uint32_t>(in + 9) << 18, SafeLoad<uint32_t>(in + 9) >> 12 | SafeLoad<uint32_t>(in + 10) << 20, SafeLoad<uint32_t>(in + 10) >> 10 | SafeLoad<uint32_t>(in + 11) << 22, SafeLoad<uint32_t>(in + 11) >> 8 | SafeLoad<uint32_t>(in + 12) << 24, SafeLoad<uint32_t>(in + 12) >> 6 | SafeLoad<uint32_t>(in + 13) << 26, SafeLoad<uint32_t>(in + 13) >> 4 | SafeLoad<uint32_t>(in + 14) << 28, SafeLoad<uint32_t>(in + 14) };
1196 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 };
1197 results = (words >> shifts) & masks;
1198 results.store_unaligned(out);
1199 out += 8;
1200
1201 // extract 30-bit bundles 16 to 23
1202 words = simd_batch{ SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 30 | SafeLoad<uint32_t>(in + 16) << 2, SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) >> 26 | SafeLoad<uint32_t>(in + 18) << 6, SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 22 | SafeLoad<uint32_t>(in + 20) << 10, SafeLoad<uint32_t>(in + 20) >> 20 | SafeLoad<uint32_t>(in + 21) << 12, SafeLoad<uint32_t>(in + 21) >> 18 | SafeLoad<uint32_t>(in + 22) << 14 };
1203 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 };
1204 results = (words >> shifts) & masks;
1205 results.store_unaligned(out);
1206 out += 8;
1207
1208 // extract 30-bit bundles 24 to 31
1209 words = simd_batch{ SafeLoad<uint32_t>(in + 22) >> 16 | SafeLoad<uint32_t>(in + 23) << 16, SafeLoad<uint32_t>(in + 23) >> 14 | SafeLoad<uint32_t>(in + 24) << 18, SafeLoad<uint32_t>(in + 24) >> 12 | SafeLoad<uint32_t>(in + 25) << 20, SafeLoad<uint32_t>(in + 25) >> 10 | SafeLoad<uint32_t>(in + 26) << 22, SafeLoad<uint32_t>(in + 26) >> 8 | SafeLoad<uint32_t>(in + 27) << 24, SafeLoad<uint32_t>(in + 27) >> 6 | SafeLoad<uint32_t>(in + 28) << 26, SafeLoad<uint32_t>(in + 28) >> 4 | SafeLoad<uint32_t>(in + 29) << 28, SafeLoad<uint32_t>(in + 29) };
1210 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 };
1211 results = (words >> shifts) & masks;
1212 results.store_unaligned(out);
1213 out += 8;
1214
1215 in += 30;
1216 return in;
1217 }
1218
1219 inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) {
1220 uint32_t mask = 0x7fffffff;
1221
1222 simd_batch masks(mask);
1223 simd_batch words, shifts;
1224 simd_batch results;
1225
1226 // extract 31-bit bundles 0 to 7
1227 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 31 | SafeLoad<uint32_t>(in + 1) << 1, SafeLoad<uint32_t>(in + 1) >> 30 | SafeLoad<uint32_t>(in + 2) << 2, SafeLoad<uint32_t>(in + 2) >> 29 | SafeLoad<uint32_t>(in + 3) << 3, SafeLoad<uint32_t>(in + 3) >> 28 | SafeLoad<uint32_t>(in + 4) << 4, SafeLoad<uint32_t>(in + 4) >> 27 | SafeLoad<uint32_t>(in + 5) << 5, SafeLoad<uint32_t>(in + 5) >> 26 | SafeLoad<uint32_t>(in + 6) << 6, SafeLoad<uint32_t>(in + 6) >> 25 | SafeLoad<uint32_t>(in + 7) << 7 };
1228 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 };
1229 results = (words >> shifts) & masks;
1230 results.store_unaligned(out);
1231 out += 8;
1232
1233 // extract 31-bit bundles 8 to 15
1234 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 24 | SafeLoad<uint32_t>(in + 8) << 8, SafeLoad<uint32_t>(in + 8) >> 23 | SafeLoad<uint32_t>(in + 9) << 9, SafeLoad<uint32_t>(in + 9) >> 22 | SafeLoad<uint32_t>(in + 10) << 10, SafeLoad<uint32_t>(in + 10) >> 21 | SafeLoad<uint32_t>(in + 11) << 11, SafeLoad<uint32_t>(in + 11) >> 20 | SafeLoad<uint32_t>(in + 12) << 12, SafeLoad<uint32_t>(in + 12) >> 19 | SafeLoad<uint32_t>(in + 13) << 13, SafeLoad<uint32_t>(in + 13) >> 18 | SafeLoad<uint32_t>(in + 14) << 14, SafeLoad<uint32_t>(in + 14) >> 17 | SafeLoad<uint32_t>(in + 15) << 15 };
1235 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 };
1236 results = (words >> shifts) & masks;
1237 results.store_unaligned(out);
1238 out += 8;
1239
1240 // extract 31-bit bundles 16 to 23
1241 words = simd_batch{ SafeLoad<uint32_t>(in + 15) >> 16 | SafeLoad<uint32_t>(in + 16) << 16, SafeLoad<uint32_t>(in + 16) >> 15 | SafeLoad<uint32_t>(in + 17) << 17, SafeLoad<uint32_t>(in + 17) >> 14 | SafeLoad<uint32_t>(in + 18) << 18, SafeLoad<uint32_t>(in + 18) >> 13 | SafeLoad<uint32_t>(in + 19) << 19, SafeLoad<uint32_t>(in + 19) >> 12 | SafeLoad<uint32_t>(in + 20) << 20, SafeLoad<uint32_t>(in + 20) >> 11 | SafeLoad<uint32_t>(in + 21) << 21, SafeLoad<uint32_t>(in + 21) >> 10 | SafeLoad<uint32_t>(in + 22) << 22, SafeLoad<uint32_t>(in + 22) >> 9 | SafeLoad<uint32_t>(in + 23) << 23 };
1242 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 };
1243 results = (words >> shifts) & masks;
1244 results.store_unaligned(out);
1245 out += 8;
1246
1247 // extract 31-bit bundles 24 to 31
1248 words = simd_batch{ SafeLoad<uint32_t>(in + 23) >> 8 | SafeLoad<uint32_t>(in + 24) << 24, SafeLoad<uint32_t>(in + 24) >> 7 | SafeLoad<uint32_t>(in + 25) << 25, SafeLoad<uint32_t>(in + 25) >> 6 | SafeLoad<uint32_t>(in + 26) << 26, SafeLoad<uint32_t>(in + 26) >> 5 | SafeLoad<uint32_t>(in + 27) << 27, SafeLoad<uint32_t>(in + 27) >> 4 | SafeLoad<uint32_t>(in + 28) << 28, SafeLoad<uint32_t>(in + 28) >> 3 | SafeLoad<uint32_t>(in + 29) << 29, SafeLoad<uint32_t>(in + 29) >> 2 | SafeLoad<uint32_t>(in + 30) << 30, SafeLoad<uint32_t>(in + 30) };
1249 shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 1 };
1250 results = (words >> shifts) & masks;
1251 results.store_unaligned(out);
1252 out += 8;
1253
1254 in += 31;
1255 return in;
1256 }
1257
// Width-32 case: the values are already full 32-bit words, so the 32 input
// words are copied to `out` unchanged.
// Returns `in` advanced by the 32 words consumed.
inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {
  constexpr int kNumValues = 32;
  std::memcpy(out, in, kNumValues * sizeof(uint32_t));
  return in + kNumValues;
}
1265
1266 }; // struct UnpackBits256
1267
1268 } // namespace
1269 } // namespace internal
1270 } // namespace arrow
1271