]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/util/bpacking_simd128_generated.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / util / bpacking_simd128_generated.h
CommitLineData
1d09f67e
TL
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Automatically generated file; DO NOT EDIT.
19
20#pragma once
21
22#include <cstdint>
23#include <cstring>
24
25#include <xsimd/xsimd.hpp>
26
27#include "arrow/util/dispatch.h"
28#include "arrow/util/ubsan.h"
29
30namespace arrow {
31namespace internal {
32namespace {
33
34using ::arrow::util::SafeLoad;
35
36template <DispatchLevel level>
37struct UnpackBits128 {
38
39#ifdef ARROW_HAVE_NEON
40using simd_arch = xsimd::neon64;
41#else
42using simd_arch = xsimd::sse4_2;
43#endif
44
45using simd_batch = xsimd::batch<uint32_t, simd_arch>;
46
// Unpacks 32 values of width 0 bits: every output value is zero.
//
// `in`  - packed input; it is not read, because zero-width values occupy no bits.
// `out` - destination for 32 uint32_t values; all 32 are set to 0.
//
// Returns `in` unchanged, since no input words are consumed.
inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) {
  std::memset(out, 0x0, 32 * sizeof(*out));
  return in;
}
53
54inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
55 uint32_t mask = 0x1;
56
57 simd_batch masks(mask);
58 simd_batch words, shifts;
59 simd_batch results;
60
61 // extract 1-bit bundles 0 to 3
62 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
63 shifts = simd_batch{ 0, 1, 2, 3 };
64 results = (words >> shifts) & masks;
65 results.store_unaligned(out);
66 out += 4;
67
68 // extract 1-bit bundles 4 to 7
69 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
70 shifts = simd_batch{ 4, 5, 6, 7 };
71 results = (words >> shifts) & masks;
72 results.store_unaligned(out);
73 out += 4;
74
75 // extract 1-bit bundles 8 to 11
76 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
77 shifts = simd_batch{ 8, 9, 10, 11 };
78 results = (words >> shifts) & masks;
79 results.store_unaligned(out);
80 out += 4;
81
82 // extract 1-bit bundles 12 to 15
83 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
84 shifts = simd_batch{ 12, 13, 14, 15 };
85 results = (words >> shifts) & masks;
86 results.store_unaligned(out);
87 out += 4;
88
89 // extract 1-bit bundles 16 to 19
90 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
91 shifts = simd_batch{ 16, 17, 18, 19 };
92 results = (words >> shifts) & masks;
93 results.store_unaligned(out);
94 out += 4;
95
96 // extract 1-bit bundles 20 to 23
97 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
98 shifts = simd_batch{ 20, 21, 22, 23 };
99 results = (words >> shifts) & masks;
100 results.store_unaligned(out);
101 out += 4;
102
103 // extract 1-bit bundles 24 to 27
104 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
105 shifts = simd_batch{ 24, 25, 26, 27 };
106 results = (words >> shifts) & masks;
107 results.store_unaligned(out);
108 out += 4;
109
110 // extract 1-bit bundles 28 to 31
111 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
112 shifts = simd_batch{ 28, 29, 30, 31 };
113 results = (words >> shifts) & masks;
114 results.store_unaligned(out);
115 out += 4;
116
117 in += 1;
118 return in;
119}
120
121inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) {
122 uint32_t mask = 0x3;
123
124 simd_batch masks(mask);
125 simd_batch words, shifts;
126 simd_batch results;
127
128 // extract 2-bit bundles 0 to 3
129 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
130 shifts = simd_batch{ 0, 2, 4, 6 };
131 results = (words >> shifts) & masks;
132 results.store_unaligned(out);
133 out += 4;
134
135 // extract 2-bit bundles 4 to 7
136 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
137 shifts = simd_batch{ 8, 10, 12, 14 };
138 results = (words >> shifts) & masks;
139 results.store_unaligned(out);
140 out += 4;
141
142 // extract 2-bit bundles 8 to 11
143 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
144 shifts = simd_batch{ 16, 18, 20, 22 };
145 results = (words >> shifts) & masks;
146 results.store_unaligned(out);
147 out += 4;
148
149 // extract 2-bit bundles 12 to 15
150 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
151 shifts = simd_batch{ 24, 26, 28, 30 };
152 results = (words >> shifts) & masks;
153 results.store_unaligned(out);
154 out += 4;
155
156 // extract 2-bit bundles 16 to 19
157 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
158 shifts = simd_batch{ 0, 2, 4, 6 };
159 results = (words >> shifts) & masks;
160 results.store_unaligned(out);
161 out += 4;
162
163 // extract 2-bit bundles 20 to 23
164 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
165 shifts = simd_batch{ 8, 10, 12, 14 };
166 results = (words >> shifts) & masks;
167 results.store_unaligned(out);
168 out += 4;
169
170 // extract 2-bit bundles 24 to 27
171 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
172 shifts = simd_batch{ 16, 18, 20, 22 };
173 results = (words >> shifts) & masks;
174 results.store_unaligned(out);
175 out += 4;
176
177 // extract 2-bit bundles 28 to 31
178 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
179 shifts = simd_batch{ 24, 26, 28, 30 };
180 results = (words >> shifts) & masks;
181 results.store_unaligned(out);
182 out += 4;
183
184 in += 2;
185 return in;
186}
187
188inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) {
189 uint32_t mask = 0x7;
190
191 simd_batch masks(mask);
192 simd_batch words, shifts;
193 simd_batch results;
194
195 // extract 3-bit bundles 0 to 3
196 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
197 shifts = simd_batch{ 0, 3, 6, 9 };
198 results = (words >> shifts) & masks;
199 results.store_unaligned(out);
200 out += 4;
201
202 // extract 3-bit bundles 4 to 7
203 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
204 shifts = simd_batch{ 12, 15, 18, 21 };
205 results = (words >> shifts) & masks;
206 results.store_unaligned(out);
207 out += 4;
208
209 // extract 3-bit bundles 8 to 11
210 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1) };
211 shifts = simd_batch{ 24, 27, 0, 1 };
212 results = (words >> shifts) & masks;
213 results.store_unaligned(out);
214 out += 4;
215
216 // extract 3-bit bundles 12 to 15
217 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
218 shifts = simd_batch{ 4, 7, 10, 13 };
219 results = (words >> shifts) & masks;
220 results.store_unaligned(out);
221 out += 4;
222
223 // extract 3-bit bundles 16 to 19
224 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
225 shifts = simd_batch{ 16, 19, 22, 25 };
226 results = (words >> shifts) & masks;
227 results.store_unaligned(out);
228 out += 4;
229
230 // extract 3-bit bundles 20 to 23
231 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
232 shifts = simd_batch{ 28, 0, 2, 5 };
233 results = (words >> shifts) & masks;
234 results.store_unaligned(out);
235 out += 4;
236
237 // extract 3-bit bundles 24 to 27
238 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
239 shifts = simd_batch{ 8, 11, 14, 17 };
240 results = (words >> shifts) & masks;
241 results.store_unaligned(out);
242 out += 4;
243
244 // extract 3-bit bundles 28 to 31
245 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
246 shifts = simd_batch{ 20, 23, 26, 29 };
247 results = (words >> shifts) & masks;
248 results.store_unaligned(out);
249 out += 4;
250
251 in += 3;
252 return in;
253}
254
255inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) {
256 uint32_t mask = 0xf;
257
258 simd_batch masks(mask);
259 simd_batch words, shifts;
260 simd_batch results;
261
262 // extract 4-bit bundles 0 to 3
263 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
264 shifts = simd_batch{ 0, 4, 8, 12 };
265 results = (words >> shifts) & masks;
266 results.store_unaligned(out);
267 out += 4;
268
269 // extract 4-bit bundles 4 to 7
270 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
271 shifts = simd_batch{ 16, 20, 24, 28 };
272 results = (words >> shifts) & masks;
273 results.store_unaligned(out);
274 out += 4;
275
276 // extract 4-bit bundles 8 to 11
277 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
278 shifts = simd_batch{ 0, 4, 8, 12 };
279 results = (words >> shifts) & masks;
280 results.store_unaligned(out);
281 out += 4;
282
283 // extract 4-bit bundles 12 to 15
284 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
285 shifts = simd_batch{ 16, 20, 24, 28 };
286 results = (words >> shifts) & masks;
287 results.store_unaligned(out);
288 out += 4;
289
290 // extract 4-bit bundles 16 to 19
291 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
292 shifts = simd_batch{ 0, 4, 8, 12 };
293 results = (words >> shifts) & masks;
294 results.store_unaligned(out);
295 out += 4;
296
297 // extract 4-bit bundles 20 to 23
298 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
299 shifts = simd_batch{ 16, 20, 24, 28 };
300 results = (words >> shifts) & masks;
301 results.store_unaligned(out);
302 out += 4;
303
304 // extract 4-bit bundles 24 to 27
305 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
306 shifts = simd_batch{ 0, 4, 8, 12 };
307 results = (words >> shifts) & masks;
308 results.store_unaligned(out);
309 out += 4;
310
311 // extract 4-bit bundles 28 to 31
312 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
313 shifts = simd_batch{ 16, 20, 24, 28 };
314 results = (words >> shifts) & masks;
315 results.store_unaligned(out);
316 out += 4;
317
318 in += 4;
319 return in;
320}
321
322inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) {
323 uint32_t mask = 0x1f;
324
325 simd_batch masks(mask);
326 simd_batch words, shifts;
327 simd_batch results;
328
329 // extract 5-bit bundles 0 to 3
330 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
331 shifts = simd_batch{ 0, 5, 10, 15 };
332 results = (words >> shifts) & masks;
333 results.store_unaligned(out);
334 out += 4;
335
336 // extract 5-bit bundles 4 to 7
337 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1) };
338 shifts = simd_batch{ 20, 25, 0, 3 };
339 results = (words >> shifts) & masks;
340 results.store_unaligned(out);
341 out += 4;
342
343 // extract 5-bit bundles 8 to 11
344 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
345 shifts = simd_batch{ 8, 13, 18, 23 };
346 results = (words >> shifts) & masks;
347 results.store_unaligned(out);
348 out += 4;
349
350 // extract 5-bit bundles 12 to 15
351 words = simd_batch{ SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
352 shifts = simd_batch{ 0, 1, 6, 11 };
353 results = (words >> shifts) & masks;
354 results.store_unaligned(out);
355 out += 4;
356
357 // extract 5-bit bundles 16 to 19
358 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 31 | SafeLoad<uint32_t>(in + 3) << 1 };
359 shifts = simd_batch{ 16, 21, 26, 0 };
360 results = (words >> shifts) & masks;
361 results.store_unaligned(out);
362 out += 4;
363
364 // extract 5-bit bundles 20 to 23
365 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
366 shifts = simd_batch{ 4, 9, 14, 19 };
367 results = (words >> shifts) & masks;
368 results.store_unaligned(out);
369 out += 4;
370
371 // extract 5-bit bundles 24 to 27
372 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 29 | SafeLoad<uint32_t>(in + 4) << 3, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
373 shifts = simd_batch{ 24, 0, 2, 7 };
374 results = (words >> shifts) & masks;
375 results.store_unaligned(out);
376 out += 4;
377
378 // extract 5-bit bundles 28 to 31
379 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
380 shifts = simd_batch{ 12, 17, 22, 27 };
381 results = (words >> shifts) & masks;
382 results.store_unaligned(out);
383 out += 4;
384
385 in += 5;
386 return in;
387}
388
389inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) {
390 uint32_t mask = 0x3f;
391
392 simd_batch masks(mask);
393 simd_batch words, shifts;
394 simd_batch results;
395
396 // extract 6-bit bundles 0 to 3
397 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
398 shifts = simd_batch{ 0, 6, 12, 18 };
399 results = (words >> shifts) & masks;
400 results.store_unaligned(out);
401 out += 4;
402
403 // extract 6-bit bundles 4 to 7
404 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
405 shifts = simd_batch{ 24, 0, 4, 10 };
406 results = (words >> shifts) & masks;
407 results.store_unaligned(out);
408 out += 4;
409
410 // extract 6-bit bundles 8 to 11
411 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2) };
412 shifts = simd_batch{ 16, 22, 0, 2 };
413 results = (words >> shifts) & masks;
414 results.store_unaligned(out);
415 out += 4;
416
417 // extract 6-bit bundles 12 to 15
418 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
419 shifts = simd_batch{ 8, 14, 20, 26 };
420 results = (words >> shifts) & masks;
421 results.store_unaligned(out);
422 out += 4;
423
424 // extract 6-bit bundles 16 to 19
425 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
426 shifts = simd_batch{ 0, 6, 12, 18 };
427 results = (words >> shifts) & masks;
428 results.store_unaligned(out);
429 out += 4;
430
431 // extract 6-bit bundles 20 to 23
432 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
433 shifts = simd_batch{ 24, 0, 4, 10 };
434 results = (words >> shifts) & masks;
435 results.store_unaligned(out);
436 out += 4;
437
438 // extract 6-bit bundles 24 to 27
439 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5) };
440 shifts = simd_batch{ 16, 22, 0, 2 };
441 results = (words >> shifts) & masks;
442 results.store_unaligned(out);
443 out += 4;
444
445 // extract 6-bit bundles 28 to 31
446 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
447 shifts = simd_batch{ 8, 14, 20, 26 };
448 results = (words >> shifts) & masks;
449 results.store_unaligned(out);
450 out += 4;
451
452 in += 6;
453 return in;
454}
455
456inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) {
457 uint32_t mask = 0x7f;
458
459 simd_batch masks(mask);
460 simd_batch words, shifts;
461 simd_batch results;
462
463 // extract 7-bit bundles 0 to 3
464 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
465 shifts = simd_batch{ 0, 7, 14, 21 };
466 results = (words >> shifts) & masks;
467 results.store_unaligned(out);
468 out += 4;
469
470 // extract 7-bit bundles 4 to 7
471 words = simd_batch{ SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
472 shifts = simd_batch{ 0, 3, 10, 17 };
473 results = (words >> shifts) & masks;
474 results.store_unaligned(out);
475 out += 4;
476
477 // extract 7-bit bundles 8 to 11
478 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
479 shifts = simd_batch{ 24, 0, 6, 13 };
480 results = (words >> shifts) & masks;
481 results.store_unaligned(out);
482 out += 4;
483
484 // extract 7-bit bundles 12 to 15
485 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 27 | SafeLoad<uint32_t>(in + 3) << 5, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
486 shifts = simd_batch{ 20, 0, 2, 9 };
487 results = (words >> shifts) & masks;
488 results.store_unaligned(out);
489 out += 4;
490
491 // extract 7-bit bundles 16 to 19
492 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4) };
493 shifts = simd_batch{ 16, 23, 0, 5 };
494 results = (words >> shifts) & masks;
495 results.store_unaligned(out);
496 out += 4;
497
498 // extract 7-bit bundles 20 to 23
499 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5) };
500 shifts = simd_batch{ 12, 19, 0, 1 };
501 results = (words >> shifts) & masks;
502 results.store_unaligned(out);
503 out += 4;
504
505 // extract 7-bit bundles 24 to 27
506 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3 };
507 shifts = simd_batch{ 8, 15, 22, 0 };
508 results = (words >> shifts) & masks;
509 results.store_unaligned(out);
510 out += 4;
511
512 // extract 7-bit bundles 28 to 31
513 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
514 shifts = simd_batch{ 4, 11, 18, 25 };
515 results = (words >> shifts) & masks;
516 results.store_unaligned(out);
517 out += 4;
518
519 in += 7;
520 return in;
521}
522
523inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) {
524 uint32_t mask = 0xff;
525
526 simd_batch masks(mask);
527 simd_batch words, shifts;
528 simd_batch results;
529
530 // extract 8-bit bundles 0 to 3
531 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) };
532 shifts = simd_batch{ 0, 8, 16, 24 };
533 results = (words >> shifts) & masks;
534 results.store_unaligned(out);
535 out += 4;
536
537 // extract 8-bit bundles 4 to 7
538 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
539 shifts = simd_batch{ 0, 8, 16, 24 };
540 results = (words >> shifts) & masks;
541 results.store_unaligned(out);
542 out += 4;
543
544 // extract 8-bit bundles 8 to 11
545 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
546 shifts = simd_batch{ 0, 8, 16, 24 };
547 results = (words >> shifts) & masks;
548 results.store_unaligned(out);
549 out += 4;
550
551 // extract 8-bit bundles 12 to 15
552 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
553 shifts = simd_batch{ 0, 8, 16, 24 };
554 results = (words >> shifts) & masks;
555 results.store_unaligned(out);
556 out += 4;
557
558 // extract 8-bit bundles 16 to 19
559 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
560 shifts = simd_batch{ 0, 8, 16, 24 };
561 results = (words >> shifts) & masks;
562 results.store_unaligned(out);
563 out += 4;
564
565 // extract 8-bit bundles 20 to 23
566 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
567 shifts = simd_batch{ 0, 8, 16, 24 };
568 results = (words >> shifts) & masks;
569 results.store_unaligned(out);
570 out += 4;
571
572 // extract 8-bit bundles 24 to 27
573 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
574 shifts = simd_batch{ 0, 8, 16, 24 };
575 results = (words >> shifts) & masks;
576 results.store_unaligned(out);
577 out += 4;
578
579 // extract 8-bit bundles 28 to 31
580 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) };
581 shifts = simd_batch{ 0, 8, 16, 24 };
582 results = (words >> shifts) & masks;
583 results.store_unaligned(out);
584 out += 4;
585
586 in += 8;
587 return in;
588}
589
590inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) {
591 uint32_t mask = 0x1ff;
592
593 simd_batch masks(mask);
594 simd_batch words, shifts;
595 simd_batch results;
596
597 // extract 9-bit bundles 0 to 3
598 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 27 | SafeLoad<uint32_t>(in + 1) << 5 };
599 shifts = simd_batch{ 0, 9, 18, 0 };
600 results = (words >> shifts) & masks;
601 results.store_unaligned(out);
602 out += 4;
603
604 // extract 9-bit bundles 4 to 7
605 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1 };
606 shifts = simd_batch{ 4, 13, 22, 0 };
607 results = (words >> shifts) & masks;
608 results.store_unaligned(out);
609 out += 4;
610
611 // extract 9-bit bundles 8 to 11
612 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3) };
613 shifts = simd_batch{ 8, 17, 0, 3 };
614 results = (words >> shifts) & masks;
615 results.store_unaligned(out);
616 out += 4;
617
618 // extract 9-bit bundles 12 to 15
619 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4) };
620 shifts = simd_batch{ 12, 21, 0, 7 };
621 results = (words >> shifts) & masks;
622 results.store_unaligned(out);
623 out += 4;
624
625 // extract 9-bit bundles 16 to 19
626 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 25 | SafeLoad<uint32_t>(in + 5) << 7, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
627 shifts = simd_batch{ 16, 0, 2, 11 };
628 results = (words >> shifts) & masks;
629 results.store_unaligned(out);
630 out += 4;
631
632 // extract 9-bit bundles 20 to 23
633 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
634 shifts = simd_batch{ 20, 0, 6, 15 };
635 results = (words >> shifts) & masks;
636 results.store_unaligned(out);
637 out += 4;
638
639 // extract 9-bit bundles 24 to 27
640 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) };
641 shifts = simd_batch{ 0, 1, 10, 19 };
642 results = (words >> shifts) & masks;
643 results.store_unaligned(out);
644 out += 4;
645
646 // extract 9-bit bundles 28 to 31
647 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) };
648 shifts = simd_batch{ 0, 5, 14, 23 };
649 results = (words >> shifts) & masks;
650 results.store_unaligned(out);
651 out += 4;
652
653 in += 9;
654 return in;
655}
656
657inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) {
658 uint32_t mask = 0x3ff;
659
660 simd_batch masks(mask);
661 simd_batch words, shifts;
662 simd_batch results;
663
664 // extract 10-bit bundles 0 to 3
665 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2 };
666 shifts = simd_batch{ 0, 10, 20, 0 };
667 results = (words >> shifts) & masks;
668 results.store_unaligned(out);
669 out += 4;
670
671 // extract 10-bit bundles 4 to 7
672 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2) };
673 shifts = simd_batch{ 8, 18, 0, 6 };
674 results = (words >> shifts) & masks;
675 results.store_unaligned(out);
676 out += 4;
677
678 // extract 10-bit bundles 8 to 11
679 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
680 shifts = simd_batch{ 16, 0, 4, 14 };
681 results = (words >> shifts) & masks;
682 results.store_unaligned(out);
683 out += 4;
684
685 // extract 10-bit bundles 12 to 15
686 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
687 shifts = simd_batch{ 0, 2, 12, 22 };
688 results = (words >> shifts) & masks;
689 results.store_unaligned(out);
690 out += 4;
691
692 // extract 10-bit bundles 16 to 19
693 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 30 | SafeLoad<uint32_t>(in + 6) << 2 };
694 shifts = simd_batch{ 0, 10, 20, 0 };
695 results = (words >> shifts) & masks;
696 results.store_unaligned(out);
697 out += 4;
698
699 // extract 10-bit bundles 20 to 23
700 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7) };
701 shifts = simd_batch{ 8, 18, 0, 6 };
702 results = (words >> shifts) & masks;
703 results.store_unaligned(out);
704 out += 4;
705
706 // extract 10-bit bundles 24 to 27
707 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 26 | SafeLoad<uint32_t>(in + 8) << 6, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) };
708 shifts = simd_batch{ 16, 0, 4, 14 };
709 results = (words >> shifts) & masks;
710 results.store_unaligned(out);
711 out += 4;
712
713 // extract 10-bit bundles 28 to 31
714 words = simd_batch{ SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) };
715 shifts = simd_batch{ 0, 2, 12, 22 };
716 results = (words >> shifts) & masks;
717 results.store_unaligned(out);
718 out += 4;
719
720 in += 10;
721 return in;
722}
723
724inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) {
725 uint32_t mask = 0x7ff;
726
727 simd_batch masks(mask);
728 simd_batch words, shifts;
729 simd_batch results;
730
731 // extract 11-bit bundles 0 to 3
732 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 22 | SafeLoad<uint32_t>(in + 1) << 10, SafeLoad<uint32_t>(in + 1) };
733 shifts = simd_batch{ 0, 11, 0, 1 };
734 results = (words >> shifts) & masks;
735 results.store_unaligned(out);
736 out += 4;
737
738 // extract 11-bit bundles 4 to 7
739 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 23 | SafeLoad<uint32_t>(in + 2) << 9, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
740 shifts = simd_batch{ 12, 0, 2, 13 };
741 results = (words >> shifts) & masks;
742 results.store_unaligned(out);
743 out += 4;
744
745 // extract 11-bit bundles 8 to 11
746 words = simd_batch{ SafeLoad<uint32_t>(in + 2) >> 24 | SafeLoad<uint32_t>(in + 3) << 8, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 25 | SafeLoad<uint32_t>(in + 4) << 7 };
747 shifts = simd_batch{ 0, 3, 14, 0 };
748 results = (words >> shifts) & masks;
749 results.store_unaligned(out);
750 out += 4;
751
752 // extract 11-bit bundles 12 to 15
753 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6, SafeLoad<uint32_t>(in + 5) };
754 shifts = simd_batch{ 4, 15, 0, 5 };
755 results = (words >> shifts) & masks;
756 results.store_unaligned(out);
757 out += 4;
758
759 // extract 11-bit bundles 16 to 19
760 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 27 | SafeLoad<uint32_t>(in + 6) << 5, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
761 shifts = simd_batch{ 16, 0, 6, 17 };
762 results = (words >> shifts) & masks;
763 results.store_unaligned(out);
764 out += 4;
765
766 // extract 11-bit bundles 20 to 23
767 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 29 | SafeLoad<uint32_t>(in + 8) << 3 };
768 shifts = simd_batch{ 0, 7, 18, 0 };
769 results = (words >> shifts) & masks;
770 results.store_unaligned(out);
771 out += 4;
772
773 // extract 11-bit bundles 24 to 27
774 words = simd_batch{ SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9) };
775 shifts = simd_batch{ 8, 19, 0, 9 };
776 results = (words >> shifts) & masks;
777 results.store_unaligned(out);
778 out += 4;
779
780 // extract 11-bit bundles 28 to 31
781 words = simd_batch{ SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 31 | SafeLoad<uint32_t>(in + 10) << 1, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) };
782 shifts = simd_batch{ 20, 0, 10, 21 };
783 results = (words >> shifts) & masks;
784 results.store_unaligned(out);
785 out += 4;
786
787 in += 11;
788 return in;
789}
790
791inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) {
792 uint32_t mask = 0xfff;
793
794 simd_batch masks(mask);
795 simd_batch words, shifts;
796 simd_batch results;
797
798 // extract 12-bit bundles 0 to 3
799 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 24 | SafeLoad<uint32_t>(in + 1) << 8, SafeLoad<uint32_t>(in + 1) };
800 shifts = simd_batch{ 0, 12, 0, 4 };
801 results = (words >> shifts) & masks;
802 results.store_unaligned(out);
803 out += 4;
804
805 // extract 12-bit bundles 4 to 7
806 words = simd_batch{ SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) };
807 shifts = simd_batch{ 16, 0, 8, 20 };
808 results = (words >> shifts) & masks;
809 results.store_unaligned(out);
810 out += 4;
811
812 // extract 12-bit bundles 8 to 11
813 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) };
814 shifts = simd_batch{ 0, 12, 0, 4 };
815 results = (words >> shifts) & masks;
816 results.store_unaligned(out);
817 out += 4;
818
819 // extract 12-bit bundles 12 to 15
820 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
821 shifts = simd_batch{ 16, 0, 8, 20 };
822 results = (words >> shifts) & masks;
823 results.store_unaligned(out);
824 out += 4;
825
826 // extract 12-bit bundles 16 to 19
827 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7) };
828 shifts = simd_batch{ 0, 12, 0, 4 };
829 results = (words >> shifts) & masks;
830 results.store_unaligned(out);
831 out += 4;
832
833 // extract 12-bit bundles 20 to 23
834 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) };
835 shifts = simd_batch{ 16, 0, 8, 20 };
836 results = (words >> shifts) & masks;
837 results.store_unaligned(out);
838 out += 4;
839
840 // extract 12-bit bundles 24 to 27
841 words = simd_batch{ SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10) };
842 shifts = simd_batch{ 0, 12, 0, 4 };
843 results = (words >> shifts) & masks;
844 results.store_unaligned(out);
845 out += 4;
846
847 // extract 12-bit bundles 28 to 31
848 words = simd_batch{ SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 28 | SafeLoad<uint32_t>(in + 11) << 4, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) };
849 shifts = simd_batch{ 16, 0, 8, 20 };
850 results = (words >> shifts) & masks;
851 results.store_unaligned(out);
852 out += 4;
853
854 in += 12;
855 return in;
856}
857
858inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) {
859 uint32_t mask = 0x1fff;
860
861 simd_batch masks(mask);
862 simd_batch words, shifts;
863 simd_batch results;
864
865 // extract 13-bit bundles 0 to 3
866 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 26 | SafeLoad<uint32_t>(in + 1) << 6, SafeLoad<uint32_t>(in + 1) };
867 shifts = simd_batch{ 0, 13, 0, 7 };
868 results = (words >> shifts) & masks;
869 results.store_unaligned(out);
870 out += 4;
871
872 // extract 13-bit bundles 4 to 7
873 words = simd_batch{ SafeLoad<uint32_t>(in + 1) >> 20 | SafeLoad<uint32_t>(in + 2) << 12, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 27 | SafeLoad<uint32_t>(in + 3) << 5 };
874 shifts = simd_batch{ 0, 1, 14, 0 };
875 results = (words >> shifts) & masks;
876 results.store_unaligned(out);
877 out += 4;
878
879 // extract 13-bit bundles 8 to 11
880 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 21 | SafeLoad<uint32_t>(in + 4) << 11, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) };
881 shifts = simd_batch{ 8, 0, 2, 15 };
882 results = (words >> shifts) & masks;
883 results.store_unaligned(out);
884 out += 4;
885
886 // extract 13-bit bundles 12 to 15
887 words = simd_batch{ SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10, SafeLoad<uint32_t>(in + 6) };
888 shifts = simd_batch{ 0, 9, 0, 3 };
889 results = (words >> shifts) & masks;
890 results.store_unaligned(out);
891 out += 4;
892
893 // extract 13-bit bundles 16 to 19
894 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 29 | SafeLoad<uint32_t>(in + 7) << 3, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 23 | SafeLoad<uint32_t>(in + 8) << 9 };
895 shifts = simd_batch{ 16, 0, 10, 0 };
896 results = (words >> shifts) & masks;
897 results.store_unaligned(out);
898 out += 4;
899
900 // extract 13-bit bundles 20 to 23
901 words = simd_batch{ SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9) };
902 shifts = simd_batch{ 4, 17, 0, 11 };
903 results = (words >> shifts) & masks;
904 results.store_unaligned(out);
905 out += 4;
906
907 // extract 13-bit bundles 24 to 27
908 words = simd_batch{ SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 31 | SafeLoad<uint32_t>(in + 11) << 1 };
909 shifts = simd_batch{ 0, 5, 18, 0 };
910 results = (words >> shifts) & masks;
911 results.store_unaligned(out);
912 out += 4;
913
914 // extract 13-bit bundles 28 to 31
915 words = simd_batch{ SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 25 | SafeLoad<uint32_t>(in + 12) << 7, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) };
916 shifts = simd_batch{ 12, 0, 6, 19 };
917 results = (words >> shifts) & masks;
918 results.store_unaligned(out);
919 out += 4;
920
921 in += 13;
922 return in;
923}
924
925inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) {
926 uint32_t mask = 0x3fff;
927
928 simd_batch masks(mask);
929 simd_batch words, shifts;
930 simd_batch results;
931
932 // extract 14-bit bundles 0 to 3
933 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1) };
934 shifts = simd_batch{ 0, 14, 0, 10 };
935 results = (words >> shifts) & masks;
936 results.store_unaligned(out);
937 out += 4;
938
939 // extract 14-bit bundles 4 to 7
940 words = simd_batch{ SafeLoad<uint32_t>(in + 1) >> 24 | SafeLoad<uint32_t>(in + 2) << 8, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12, SafeLoad<uint32_t>(in + 3) };
941 shifts = simd_batch{ 0, 6, 0, 2 };
942 results = (words >> shifts) & masks;
943 results.store_unaligned(out);
944 out += 4;
945
946 // extract 14-bit bundles 8 to 11
947 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6 };
948 shifts = simd_batch{ 16, 0, 12, 0 };
949 results = (words >> shifts) & masks;
950 results.store_unaligned(out);
951 out += 4;
952
953 // extract 14-bit bundles 12 to 15
954 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) };
955 shifts = simd_batch{ 8, 0, 4, 18 };
956 results = (words >> shifts) & masks;
957 results.store_unaligned(out);
958 out += 4;
959
960 // extract 14-bit bundles 16 to 19
961 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) };
962 shifts = simd_batch{ 0, 14, 0, 10 };
963 results = (words >> shifts) & masks;
964 results.store_unaligned(out);
965 out += 4;
966
967 // extract 14-bit bundles 20 to 23
968 words = simd_batch{ SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12, SafeLoad<uint32_t>(in + 10) };
969 shifts = simd_batch{ 0, 6, 0, 2 };
970 results = (words >> shifts) & masks;
971 results.store_unaligned(out);
972 out += 4;
973
974 // extract 14-bit bundles 24 to 27
975 words = simd_batch{ SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 30 | SafeLoad<uint32_t>(in + 11) << 2, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6 };
976 shifts = simd_batch{ 16, 0, 12, 0 };
977 results = (words >> shifts) & masks;
978 results.store_unaligned(out);
979 out += 4;
980
981 // extract 14-bit bundles 28 to 31
982 words = simd_batch{ SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 22 | SafeLoad<uint32_t>(in + 13) << 10, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) };
983 shifts = simd_batch{ 8, 0, 4, 18 };
984 results = (words >> shifts) & masks;
985 results.store_unaligned(out);
986 out += 4;
987
988 in += 14;
989 return in;
990}
991
992inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) {
993 uint32_t mask = 0x7fff;
994
995 simd_batch masks(mask);
996 simd_batch words, shifts;
997 simd_batch results;
998
999 // extract 15-bit bundles 0 to 3
1000 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1) };
1001 shifts = simd_batch{ 0, 15, 0, 13 };
1002 results = (words >> shifts) & masks;
1003 results.store_unaligned(out);
1004 out += 4;
1005
1006 // extract 15-bit bundles 4 to 7
1007 words = simd_batch{ SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3) };
1008 shifts = simd_batch{ 0, 11, 0, 9 };
1009 results = (words >> shifts) & masks;
1010 results.store_unaligned(out);
1011 out += 4;
1012
1013 // extract 15-bit bundles 8 to 11
1014 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5) };
1015 shifts = simd_batch{ 0, 7, 0, 5 };
1016 results = (words >> shifts) & masks;
1017 results.store_unaligned(out);
1018 out += 4;
1019
1020 // extract 15-bit bundles 12 to 15
1021 words = simd_batch{ SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14, SafeLoad<uint32_t>(in + 7) };
1022 shifts = simd_batch{ 0, 3, 0, 1 };
1023 results = (words >> shifts) & masks;
1024 results.store_unaligned(out);
1025 out += 4;
1026
1027 // extract 15-bit bundles 16 to 19
1028 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 31 | SafeLoad<uint32_t>(in + 8) << 1, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 29 | SafeLoad<uint32_t>(in + 9) << 3 };
1029 shifts = simd_batch{ 16, 0, 14, 0 };
1030 results = (words >> shifts) & masks;
1031 results.store_unaligned(out);
1032 out += 4;
1033
1034 // extract 15-bit bundles 20 to 23
1035 words = simd_batch{ SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 27 | SafeLoad<uint32_t>(in + 10) << 5, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 25 | SafeLoad<uint32_t>(in + 11) << 7 };
1036 shifts = simd_batch{ 12, 0, 10, 0 };
1037 results = (words >> shifts) & masks;
1038 results.store_unaligned(out);
1039 out += 4;
1040
1041 // extract 15-bit bundles 24 to 27
1042 words = simd_batch{ SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 23 | SafeLoad<uint32_t>(in + 12) << 9, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 21 | SafeLoad<uint32_t>(in + 13) << 11 };
1043 shifts = simd_batch{ 8, 0, 6, 0 };
1044 results = (words >> shifts) & masks;
1045 results.store_unaligned(out);
1046 out += 4;
1047
1048 // extract 15-bit bundles 28 to 31
1049 words = simd_batch{ SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 19 | SafeLoad<uint32_t>(in + 14) << 13, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) };
1050 shifts = simd_batch{ 4, 0, 2, 17 };
1051 results = (words >> shifts) & masks;
1052 results.store_unaligned(out);
1053 out += 4;
1054
1055 in += 15;
1056 return in;
1057}
1058
1059inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) {
1060 uint32_t mask = 0xffff;
1061
1062 simd_batch masks(mask);
1063 simd_batch words, shifts;
1064 simd_batch results;
1065
1066 // extract 16-bit bundles 0 to 3
1067 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) };
1068 shifts = simd_batch{ 0, 16, 0, 16 };
1069 results = (words >> shifts) & masks;
1070 results.store_unaligned(out);
1071 out += 4;
1072
1073 // extract 16-bit bundles 4 to 7
1074 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) };
1075 shifts = simd_batch{ 0, 16, 0, 16 };
1076 results = (words >> shifts) & masks;
1077 results.store_unaligned(out);
1078 out += 4;
1079
1080 // extract 16-bit bundles 8 to 11
1081 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) };
1082 shifts = simd_batch{ 0, 16, 0, 16 };
1083 results = (words >> shifts) & masks;
1084 results.store_unaligned(out);
1085 out += 4;
1086
1087 // extract 16-bit bundles 12 to 15
1088 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) };
1089 shifts = simd_batch{ 0, 16, 0, 16 };
1090 results = (words >> shifts) & masks;
1091 results.store_unaligned(out);
1092 out += 4;
1093
1094 // extract 16-bit bundles 16 to 19
1095 words = simd_batch{ SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) };
1096 shifts = simd_batch{ 0, 16, 0, 16 };
1097 results = (words >> shifts) & masks;
1098 results.store_unaligned(out);
1099 out += 4;
1100
1101 // extract 16-bit bundles 20 to 23
1102 words = simd_batch{ SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) };
1103 shifts = simd_batch{ 0, 16, 0, 16 };
1104 results = (words >> shifts) & masks;
1105 results.store_unaligned(out);
1106 out += 4;
1107
1108 // extract 16-bit bundles 24 to 27
1109 words = simd_batch{ SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) };
1110 shifts = simd_batch{ 0, 16, 0, 16 };
1111 results = (words >> shifts) & masks;
1112 results.store_unaligned(out);
1113 out += 4;
1114
1115 // extract 16-bit bundles 28 to 31
1116 words = simd_batch{ SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) };
1117 shifts = simd_batch{ 0, 16, 0, 16 };
1118 results = (words >> shifts) & masks;
1119 results.store_unaligned(out);
1120 out += 4;
1121
1122 in += 16;
1123 return in;
1124}
1125
1126inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) {
1127 uint32_t mask = 0x1ffff;
1128
1129 simd_batch masks(mask);
1130 simd_batch words, shifts;
1131 simd_batch results;
1132
1133 // extract 17-bit bundles 0 to 3
1134 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 17 | SafeLoad<uint32_t>(in + 1) << 15, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 19 | SafeLoad<uint32_t>(in + 2) << 13 };
1135 shifts = simd_batch{ 0, 0, 2, 0 };
1136 results = (words >> shifts) & masks;
1137 results.store_unaligned(out);
1138 out += 4;
1139
1140 // extract 17-bit bundles 4 to 7
1141 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 21 | SafeLoad<uint32_t>(in + 3) << 11, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 23 | SafeLoad<uint32_t>(in + 4) << 9 };
1142 shifts = simd_batch{ 4, 0, 6, 0 };
1143 results = (words >> shifts) & masks;
1144 results.store_unaligned(out);
1145 out += 4;
1146
1147 // extract 17-bit bundles 8 to 11
1148 words = simd_batch{ SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 25 | SafeLoad<uint32_t>(in + 5) << 7, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 27 | SafeLoad<uint32_t>(in + 6) << 5 };
1149 shifts = simd_batch{ 8, 0, 10, 0 };
1150 results = (words >> shifts) & masks;
1151 results.store_unaligned(out);
1152 out += 4;
1153
1154 // extract 17-bit bundles 12 to 15
1155 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 29 | SafeLoad<uint32_t>(in + 7) << 3, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 31 | SafeLoad<uint32_t>(in + 8) << 1 };
1156 shifts = simd_batch{ 12, 0, 14, 0 };
1157 results = (words >> shifts) & masks;
1158 results.store_unaligned(out);
1159 out += 4;
1160
1161 // extract 17-bit bundles 16 to 19
1162 words = simd_batch{ SafeLoad<uint32_t>(in + 8) >> 16 | SafeLoad<uint32_t>(in + 9) << 16, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 18 | SafeLoad<uint32_t>(in + 10) << 14, SafeLoad<uint32_t>(in + 10) };
1163 shifts = simd_batch{ 0, 1, 0, 3 };
1164 results = (words >> shifts) & masks;
1165 results.store_unaligned(out);
1166 out += 4;
1167
1168 // extract 17-bit bundles 20 to 23
1169 words = simd_batch{ SafeLoad<uint32_t>(in + 10) >> 20 | SafeLoad<uint32_t>(in + 11) << 12, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 22 | SafeLoad<uint32_t>(in + 12) << 10, SafeLoad<uint32_t>(in + 12) };
1170 shifts = simd_batch{ 0, 5, 0, 7 };
1171 results = (words >> shifts) & masks;
1172 results.store_unaligned(out);
1173 out += 4;
1174
1175 // extract 17-bit bundles 24 to 27
1176 words = simd_batch{ SafeLoad<uint32_t>(in + 12) >> 24 | SafeLoad<uint32_t>(in + 13) << 8, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 26 | SafeLoad<uint32_t>(in + 14) << 6, SafeLoad<uint32_t>(in + 14) };
1177 shifts = simd_batch{ 0, 9, 0, 11 };
1178 results = (words >> shifts) & masks;
1179 results.store_unaligned(out);
1180 out += 4;
1181
1182 // extract 17-bit bundles 28 to 31
1183 words = simd_batch{ SafeLoad<uint32_t>(in + 14) >> 28 | SafeLoad<uint32_t>(in + 15) << 4, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 30 | SafeLoad<uint32_t>(in + 16) << 2, SafeLoad<uint32_t>(in + 16) };
1184 shifts = simd_batch{ 0, 13, 0, 15 };
1185 results = (words >> shifts) & masks;
1186 results.store_unaligned(out);
1187 out += 4;
1188
1189 in += 17;
1190 return in;
1191}
1192
1193inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) {
1194 uint32_t mask = 0x3ffff;
1195
1196 simd_batch masks(mask);
1197 simd_batch words, shifts;
1198 simd_batch results;
1199
1200 // extract 18-bit bundles 0 to 3
1201 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 18 | SafeLoad<uint32_t>(in + 1) << 14, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 22 | SafeLoad<uint32_t>(in + 2) << 10 };
1202 shifts = simd_batch{ 0, 0, 4, 0 };
1203 results = (words >> shifts) & masks;
1204 results.store_unaligned(out);
1205 out += 4;
1206
1207 // extract 18-bit bundles 4 to 7
1208 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2 };
1209 shifts = simd_batch{ 8, 0, 12, 0 };
1210 results = (words >> shifts) & masks;
1211 results.store_unaligned(out);
1212 out += 4;
1213
1214 // extract 18-bit bundles 8 to 11
1215 words = simd_batch{ SafeLoad<uint32_t>(in + 4) >> 16 | SafeLoad<uint32_t>(in + 5) << 16, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6) };
1216 shifts = simd_batch{ 0, 2, 0, 6 };
1217 results = (words >> shifts) & masks;
1218 results.store_unaligned(out);
1219 out += 4;
1220
1221 // extract 18-bit bundles 12 to 15
1222 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) };
1223 shifts = simd_batch{ 0, 10, 0, 14 };
1224 results = (words >> shifts) & masks;
1225 results.store_unaligned(out);
1226 out += 4;
1227
1228 // extract 18-bit bundles 16 to 19
1229 words = simd_batch{ SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 18 | SafeLoad<uint32_t>(in + 10) << 14, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 22 | SafeLoad<uint32_t>(in + 11) << 10 };
1230 shifts = simd_batch{ 0, 0, 4, 0 };
1231 results = (words >> shifts) & masks;
1232 results.store_unaligned(out);
1233 out += 4;
1234
1235 // extract 18-bit bundles 20 to 23
1236 words = simd_batch{ SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 30 | SafeLoad<uint32_t>(in + 13) << 2 };
1237 shifts = simd_batch{ 8, 0, 12, 0 };
1238 results = (words >> shifts) & masks;
1239 results.store_unaligned(out);
1240 out += 4;
1241
1242 // extract 18-bit bundles 24 to 27
1243 words = simd_batch{ SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 20 | SafeLoad<uint32_t>(in + 15) << 12, SafeLoad<uint32_t>(in + 15) };
1244 shifts = simd_batch{ 0, 2, 0, 6 };
1245 results = (words >> shifts) & masks;
1246 results.store_unaligned(out);
1247 out += 4;
1248
1249 // extract 18-bit bundles 28 to 31
1250 words = simd_batch{ SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) };
1251 shifts = simd_batch{ 0, 10, 0, 14 };
1252 results = (words >> shifts) & masks;
1253 results.store_unaligned(out);
1254 out += 4;
1255
1256 in += 18;
1257 return in;
1258}
1259
1260inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) {
1261 uint32_t mask = 0x7ffff;
1262
1263 simd_batch masks(mask);
1264 simd_batch words, shifts;
1265 simd_batch results;
1266
1267 // extract 19-bit bundles 0 to 3
1268 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 19 | SafeLoad<uint32_t>(in + 1) << 13, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 25 | SafeLoad<uint32_t>(in + 2) << 7 };
1269 shifts = simd_batch{ 0, 0, 6, 0 };
1270 results = (words >> shifts) & masks;
1271 results.store_unaligned(out);
1272 out += 4;
1273
1274 // extract 19-bit bundles 4 to 7
1275 words = simd_batch{ SafeLoad<uint32_t>(in + 2), SafeLoad<uint32_t>(in + 2) >> 31 | SafeLoad<uint32_t>(in + 3) << 1, SafeLoad<uint32_t>(in + 3) >> 18 | SafeLoad<uint32_t>(in + 4) << 14, SafeLoad<uint32_t>(in + 4) };
1276 shifts = simd_batch{ 12, 0, 0, 5 };
1277 results = (words >> shifts) & masks;
1278 results.store_unaligned(out);
1279 out += 4;
1280
1281 // extract 19-bit bundles 8 to 11
1282 words = simd_batch{ SafeLoad<uint32_t>(in + 4) >> 24 | SafeLoad<uint32_t>(in + 5) << 8, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 30 | SafeLoad<uint32_t>(in + 6) << 2, SafeLoad<uint32_t>(in + 6) >> 17 | SafeLoad<uint32_t>(in + 7) << 15 };
1283 shifts = simd_batch{ 0, 11, 0, 0 };
1284 results = (words >> shifts) & masks;
1285 results.store_unaligned(out);
1286 out += 4;
1287
1288 // extract 19-bit bundles 12 to 15
1289 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 23 | SafeLoad<uint32_t>(in + 8) << 9, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 29 | SafeLoad<uint32_t>(in + 9) << 3 };
1290 shifts = simd_batch{ 4, 0, 10, 0 };
1291 results = (words >> shifts) & masks;
1292 results.store_unaligned(out);
1293 out += 4;
1294
1295 // extract 19-bit bundles 16 to 19
1296 words = simd_batch{ SafeLoad<uint32_t>(in + 9) >> 16 | SafeLoad<uint32_t>(in + 10) << 16, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 22 | SafeLoad<uint32_t>(in + 11) << 10, SafeLoad<uint32_t>(in + 11) };
1297 shifts = simd_batch{ 0, 3, 0, 9 };
1298 results = (words >> shifts) & masks;
1299 results.store_unaligned(out);
1300 out += 4;
1301
1302 // extract 19-bit bundles 20 to 23
1303 words = simd_batch{ SafeLoad<uint32_t>(in + 11) >> 28 | SafeLoad<uint32_t>(in + 12) << 4, SafeLoad<uint32_t>(in + 12) >> 15 | SafeLoad<uint32_t>(in + 13) << 17, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 21 | SafeLoad<uint32_t>(in + 14) << 11 };
1304 shifts = simd_batch{ 0, 0, 2, 0 };
1305 results = (words >> shifts) & masks;
1306 results.store_unaligned(out);
1307 out += 4;
1308
1309 // extract 19-bit bundles 24 to 27
1310 words = simd_batch{ SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 27 | SafeLoad<uint32_t>(in + 15) << 5, SafeLoad<uint32_t>(in + 15) >> 14 | SafeLoad<uint32_t>(in + 16) << 18, SafeLoad<uint32_t>(in + 16) };
1311 shifts = simd_batch{ 8, 0, 0, 1 };
1312 results = (words >> shifts) & masks;
1313 results.store_unaligned(out);
1314 out += 4;
1315
1316 // extract 19-bit bundles 28 to 31
1317 words = simd_batch{ SafeLoad<uint32_t>(in + 16) >> 20 | SafeLoad<uint32_t>(in + 17) << 12, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 26 | SafeLoad<uint32_t>(in + 18) << 6, SafeLoad<uint32_t>(in + 18) };
1318 shifts = simd_batch{ 0, 7, 0, 13 };
1319 results = (words >> shifts) & masks;
1320 results.store_unaligned(out);
1321 out += 4;
1322
1323 in += 19;
1324 return in;
1325}
1326
1327inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) {
1328 uint32_t mask = 0xfffff;
1329
1330 simd_batch masks(mask);
1331 simd_batch words, shifts;
1332 simd_batch results;
1333
1334 // extract 20-bit bundles 0 to 3
1335 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 20 | SafeLoad<uint32_t>(in + 1) << 12, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4 };
1336 shifts = simd_batch{ 0, 0, 8, 0 };
1337 results = (words >> shifts) & masks;
1338 results.store_unaligned(out);
1339 out += 4;
1340
1341 // extract 20-bit bundles 4 to 7
1342 words = simd_batch{ SafeLoad<uint32_t>(in + 2) >> 16 | SafeLoad<uint32_t>(in + 3) << 16, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) };
1343 shifts = simd_batch{ 0, 4, 0, 12 };
1344 results = (words >> shifts) & masks;
1345 results.store_unaligned(out);
1346 out += 4;
1347
1348 // extract 20-bit bundles 8 to 11
1349 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4 };
1350 shifts = simd_batch{ 0, 0, 8, 0 };
1351 results = (words >> shifts) & masks;
1352 results.store_unaligned(out);
1353 out += 4;
1354
1355 // extract 20-bit bundles 12 to 15
1356 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9) };
1357 shifts = simd_batch{ 0, 4, 0, 12 };
1358 results = (words >> shifts) & masks;
1359 results.store_unaligned(out);
1360 out += 4;
1361
1362 // extract 20-bit bundles 16 to 19
1363 words = simd_batch{ SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 20 | SafeLoad<uint32_t>(in + 11) << 12, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 28 | SafeLoad<uint32_t>(in + 12) << 4 };
1364 shifts = simd_batch{ 0, 0, 8, 0 };
1365 results = (words >> shifts) & masks;
1366 results.store_unaligned(out);
1367 out += 4;
1368
1369 // extract 20-bit bundles 20 to 23
1370 words = simd_batch{ SafeLoad<uint32_t>(in + 12) >> 16 | SafeLoad<uint32_t>(in + 13) << 16, SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 24 | SafeLoad<uint32_t>(in + 14) << 8, SafeLoad<uint32_t>(in + 14) };
1371 shifts = simd_batch{ 0, 4, 0, 12 };
1372 results = (words >> shifts) & masks;
1373 results.store_unaligned(out);
1374 out += 4;
1375
1376 // extract 20-bit bundles 24 to 27
1377 words = simd_batch{ SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 20 | SafeLoad<uint32_t>(in + 16) << 12, SafeLoad<uint32_t>(in + 16), SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4 };
1378 shifts = simd_batch{ 0, 0, 8, 0 };
1379 results = (words >> shifts) & masks;
1380 results.store_unaligned(out);
1381 out += 4;
1382
1383 // extract 20-bit bundles 28 to 31
1384 words = simd_batch{ SafeLoad<uint32_t>(in + 17) >> 16 | SafeLoad<uint32_t>(in + 18) << 16, SafeLoad<uint32_t>(in + 18), SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) };
1385 shifts = simd_batch{ 0, 4, 0, 12 };
1386 results = (words >> shifts) & masks;
1387 results.store_unaligned(out);
1388 out += 4;
1389
1390 in += 20;
1391 return in;
1392}
1393
1394inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) {
1395 uint32_t mask = 0x1fffff;
1396
1397 simd_batch masks(mask);
1398 simd_batch words, shifts;
1399 simd_batch results;
1400
1401 // extract 21-bit bundles 0 to 3
1402 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 21 | SafeLoad<uint32_t>(in + 1) << 11, SafeLoad<uint32_t>(in + 1), SafeLoad<uint32_t>(in + 1) >> 31 | SafeLoad<uint32_t>(in + 2) << 1 };
1403 shifts = simd_batch{ 0, 0, 10, 0 };
1404 results = (words >> shifts) & masks;
1405 results.store_unaligned(out);
1406 out += 4;
1407
1408 // extract 21-bit bundles 4 to 7
1409 words = simd_batch{ SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12, SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 30 | SafeLoad<uint32_t>(in + 4) << 2, SafeLoad<uint32_t>(in + 4) >> 19 | SafeLoad<uint32_t>(in + 5) << 13 };
1410 shifts = simd_batch{ 0, 9, 0, 0 };
1411 results = (words >> shifts) & masks;
1412 results.store_unaligned(out);
1413 out += 4;
1414
1415 // extract 21-bit bundles 8 to 11
1416 words = simd_batch{ SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3, SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14, SafeLoad<uint32_t>(in + 7) };
1417 shifts = simd_batch{ 8, 0, 0, 7 };
1418 results = (words >> shifts) & masks;
1419 results.store_unaligned(out);
1420 out += 4;
1421
1422 // extract 21-bit bundles 12 to 15
1423 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) >> 17 | SafeLoad<uint32_t>(in + 9) << 15, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 27 | SafeLoad<uint32_t>(in + 10) << 5 };
1424 shifts = simd_batch{ 0, 0, 6, 0 };
1425 results = (words >> shifts) & masks;
1426 results.store_unaligned(out);
1427 out += 4;
1428
1429 // extract 21-bit bundles 16 to 19
1430 words = simd_batch{ SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12) >> 15 | SafeLoad<uint32_t>(in + 13) << 17 };
1431 shifts = simd_batch{ 0, 5, 0, 0 };
1432 results = (words >> shifts) & masks;
1433 results.store_unaligned(out);
1434 out += 4;
1435
1436 // extract 21-bit bundles 20 to 23
1437 words = simd_batch{ SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 25 | SafeLoad<uint32_t>(in + 14) << 7, SafeLoad<uint32_t>(in + 14) >> 14 | SafeLoad<uint32_t>(in + 15) << 18, SafeLoad<uint32_t>(in + 15) };
1438 shifts = simd_batch{ 4, 0, 0, 3 };
1439 results = (words >> shifts) & masks;
1440 results.store_unaligned(out);
1441 out += 4;
1442
1443 // extract 21-bit bundles 24 to 27
1444 words = simd_batch{ SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 13 | SafeLoad<uint32_t>(in + 17) << 19, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 23 | SafeLoad<uint32_t>(in + 18) << 9 };
1445 shifts = simd_batch{ 0, 0, 2, 0 };
1446 results = (words >> shifts) & masks;
1447 results.store_unaligned(out);
1448 out += 4;
1449
1450 // extract 21-bit bundles 28 to 31
1451 words = simd_batch{ SafeLoad<uint32_t>(in + 18) >> 12 | SafeLoad<uint32_t>(in + 19) << 20, SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 22 | SafeLoad<uint32_t>(in + 20) << 10, SafeLoad<uint32_t>(in + 20) };
1452 shifts = simd_batch{ 0, 1, 0, 11 };
1453 results = (words >> shifts) & masks;
1454 results.store_unaligned(out);
1455 out += 4;
1456
1457 in += 21;
1458 return in;
1459}
1460
1461inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) {
1462 uint32_t mask = 0x3fffff;
1463
1464 simd_batch masks(mask);
1465 simd_batch words, shifts;
1466 simd_batch results;
1467
1468 // extract 22-bit bundles 0 to 3
1469 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 22 | SafeLoad<uint32_t>(in + 1) << 10, SafeLoad<uint32_t>(in + 1) >> 12 | SafeLoad<uint32_t>(in + 2) << 20, SafeLoad<uint32_t>(in + 2) };
1470 shifts = simd_batch{ 0, 0, 0, 2 };
1471 results = (words >> shifts) & masks;
1472 results.store_unaligned(out);
1473 out += 4;
1474
1475 // extract 22-bit bundles 4 to 7
1476 words = simd_batch{ SafeLoad<uint32_t>(in + 2) >> 24 | SafeLoad<uint32_t>(in + 3) << 8, SafeLoad<uint32_t>(in + 3) >> 14 | SafeLoad<uint32_t>(in + 4) << 18, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 26 | SafeLoad<uint32_t>(in + 5) << 6 };
1477 shifts = simd_batch{ 0, 0, 4, 0 };
1478 results = (words >> shifts) & masks;
1479 results.store_unaligned(out);
1480 out += 4;
1481
1482 // extract 22-bit bundles 8 to 11
1483 words = simd_batch{ SafeLoad<uint32_t>(in + 5) >> 16 | SafeLoad<uint32_t>(in + 6) << 16, SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 28 | SafeLoad<uint32_t>(in + 7) << 4, SafeLoad<uint32_t>(in + 7) >> 18 | SafeLoad<uint32_t>(in + 8) << 14 };
1484 shifts = simd_batch{ 0, 6, 0, 0 };
1485 results = (words >> shifts) & masks;
1486 results.store_unaligned(out);
1487 out += 4;
1488
1489 // extract 22-bit bundles 12 to 15
1490 words = simd_batch{ SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2, SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12, SafeLoad<uint32_t>(in + 10) };
1491 shifts = simd_batch{ 8, 0, 0, 10 };
1492 results = (words >> shifts) & masks;
1493 results.store_unaligned(out);
1494 out += 4;
1495
1496 // extract 22-bit bundles 16 to 19
1497 words = simd_batch{ SafeLoad<uint32_t>(in + 11), SafeLoad<uint32_t>(in + 11) >> 22 | SafeLoad<uint32_t>(in + 12) << 10, SafeLoad<uint32_t>(in + 12) >> 12 | SafeLoad<uint32_t>(in + 13) << 20, SafeLoad<uint32_t>(in + 13) };
1498 shifts = simd_batch{ 0, 0, 0, 2 };
1499 results = (words >> shifts) & masks;
1500 results.store_unaligned(out);
1501 out += 4;
1502
1503 // extract 22-bit bundles 20 to 23
1504 words = simd_batch{ SafeLoad<uint32_t>(in + 13) >> 24 | SafeLoad<uint32_t>(in + 14) << 8, SafeLoad<uint32_t>(in + 14) >> 14 | SafeLoad<uint32_t>(in + 15) << 18, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 26 | SafeLoad<uint32_t>(in + 16) << 6 };
1505 shifts = simd_batch{ 0, 0, 4, 0 };
1506 results = (words >> shifts) & masks;
1507 results.store_unaligned(out);
1508 out += 4;
1509
1510 // extract 22-bit bundles 24 to 27
1511 words = simd_batch{ SafeLoad<uint32_t>(in + 16) >> 16 | SafeLoad<uint32_t>(in + 17) << 16, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 28 | SafeLoad<uint32_t>(in + 18) << 4, SafeLoad<uint32_t>(in + 18) >> 18 | SafeLoad<uint32_t>(in + 19) << 14 };
1512 shifts = simd_batch{ 0, 6, 0, 0 };
1513 results = (words >> shifts) & masks;
1514 results.store_unaligned(out);
1515 out += 4;
1516
1517 // extract 22-bit bundles 28 to 31
1518 words = simd_batch{ SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 30 | SafeLoad<uint32_t>(in + 20) << 2, SafeLoad<uint32_t>(in + 20) >> 20 | SafeLoad<uint32_t>(in + 21) << 12, SafeLoad<uint32_t>(in + 21) };
1519 shifts = simd_batch{ 8, 0, 0, 10 };
1520 results = (words >> shifts) & masks;
1521 results.store_unaligned(out);
1522 out += 4;
1523
1524 in += 22;
1525 return in;
1526}
1527
1528inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) {
1529 uint32_t mask = 0x7fffff;
1530
1531 simd_batch masks(mask);
1532 simd_batch words, shifts;
1533 simd_batch results;
1534
1535 // extract 23-bit bundles 0 to 3
1536 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 23 | SafeLoad<uint32_t>(in + 1) << 9, SafeLoad<uint32_t>(in + 1) >> 14 | SafeLoad<uint32_t>(in + 2) << 18, SafeLoad<uint32_t>(in + 2) };
1537 shifts = simd_batch{ 0, 0, 0, 5 };
1538 results = (words >> shifts) & masks;
1539 results.store_unaligned(out);
1540 out += 4;
1541
1542 // extract 23-bit bundles 4 to 7
1543 words = simd_batch{ SafeLoad<uint32_t>(in + 2) >> 28 | SafeLoad<uint32_t>(in + 3) << 4, SafeLoad<uint32_t>(in + 3) >> 19 | SafeLoad<uint32_t>(in + 4) << 13, SafeLoad<uint32_t>(in + 4) >> 10 | SafeLoad<uint32_t>(in + 5) << 22, SafeLoad<uint32_t>(in + 5) };
1544 shifts = simd_batch{ 0, 0, 0, 1 };
1545 results = (words >> shifts) & masks;
1546 results.store_unaligned(out);
1547 out += 4;
1548
1549 // extract 23-bit bundles 8 to 11
1550 words = simd_batch{ SafeLoad<uint32_t>(in + 5) >> 24 | SafeLoad<uint32_t>(in + 6) << 8, SafeLoad<uint32_t>(in + 6) >> 15 | SafeLoad<uint32_t>(in + 7) << 17, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 29 | SafeLoad<uint32_t>(in + 8) << 3 };
1551 shifts = simd_batch{ 0, 0, 6, 0 };
1552 results = (words >> shifts) & masks;
1553 results.store_unaligned(out);
1554 out += 4;
1555
1556 // extract 23-bit bundles 12 to 15
1557 words = simd_batch{ SafeLoad<uint32_t>(in + 8) >> 20 | SafeLoad<uint32_t>(in + 9) << 12, SafeLoad<uint32_t>(in + 9) >> 11 | SafeLoad<uint32_t>(in + 10) << 21, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 25 | SafeLoad<uint32_t>(in + 11) << 7 };
1558 shifts = simd_batch{ 0, 0, 2, 0 };
1559 results = (words >> shifts) & masks;
1560 results.store_unaligned(out);
1561 out += 4;
1562
1563 // extract 23-bit bundles 16 to 19
1564 words = simd_batch{ SafeLoad<uint32_t>(in + 11) >> 16 | SafeLoad<uint32_t>(in + 12) << 16, SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 30 | SafeLoad<uint32_t>(in + 13) << 2, SafeLoad<uint32_t>(in + 13) >> 21 | SafeLoad<uint32_t>(in + 14) << 11 };
1565 shifts = simd_batch{ 0, 7, 0, 0 };
1566 results = (words >> shifts) & masks;
1567 results.store_unaligned(out);
1568 out += 4;
1569
1570 // extract 23-bit bundles 20 to 23
1571 words = simd_batch{ SafeLoad<uint32_t>(in + 14) >> 12 | SafeLoad<uint32_t>(in + 15) << 20, SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 26 | SafeLoad<uint32_t>(in + 16) << 6, SafeLoad<uint32_t>(in + 16) >> 17 | SafeLoad<uint32_t>(in + 17) << 15 };
1572 shifts = simd_batch{ 0, 3, 0, 0 };
1573 results = (words >> shifts) & masks;
1574 results.store_unaligned(out);
1575 out += 4;
1576
1577 // extract 23-bit bundles 24 to 27
1578 words = simd_batch{ SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 31 | SafeLoad<uint32_t>(in + 18) << 1, SafeLoad<uint32_t>(in + 18) >> 22 | SafeLoad<uint32_t>(in + 19) << 10, SafeLoad<uint32_t>(in + 19) >> 13 | SafeLoad<uint32_t>(in + 20) << 19 };
1579 shifts = simd_batch{ 8, 0, 0, 0 };
1580 results = (words >> shifts) & masks;
1581 results.store_unaligned(out);
1582 out += 4;
1583
1584 // extract 23-bit bundles 28 to 31
1585 words = simd_batch{ SafeLoad<uint32_t>(in + 20), SafeLoad<uint32_t>(in + 20) >> 27 | SafeLoad<uint32_t>(in + 21) << 5, SafeLoad<uint32_t>(in + 21) >> 18 | SafeLoad<uint32_t>(in + 22) << 14, SafeLoad<uint32_t>(in + 22) };
1586 shifts = simd_batch{ 4, 0, 0, 9 };
1587 results = (words >> shifts) & masks;
1588 results.store_unaligned(out);
1589 out += 4;
1590
1591 in += 23;
1592 return in;
1593}
1594
1595inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) {
1596 uint32_t mask = 0xffffff;
1597
1598 simd_batch masks(mask);
1599 simd_batch words, shifts;
1600 simd_batch results;
1601
1602 // extract 24-bit bundles 0 to 3
1603 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 24 | SafeLoad<uint32_t>(in + 1) << 8, SafeLoad<uint32_t>(in + 1) >> 16 | SafeLoad<uint32_t>(in + 2) << 16, SafeLoad<uint32_t>(in + 2) };
1604 shifts = simd_batch{ 0, 0, 0, 8 };
1605 results = (words >> shifts) & masks;
1606 results.store_unaligned(out);
1607 out += 4;
1608
1609 // extract 24-bit bundles 4 to 7
1610 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) >> 16 | SafeLoad<uint32_t>(in + 5) << 16, SafeLoad<uint32_t>(in + 5) };
1611 shifts = simd_batch{ 0, 0, 0, 8 };
1612 results = (words >> shifts) & masks;
1613 results.store_unaligned(out);
1614 out += 4;
1615
1616 // extract 24-bit bundles 8 to 11
1617 words = simd_batch{ SafeLoad<uint32_t>(in + 6), SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8) };
1618 shifts = simd_batch{ 0, 0, 0, 8 };
1619 results = (words >> shifts) & masks;
1620 results.store_unaligned(out);
1621 out += 4;
1622
1623 // extract 24-bit bundles 12 to 15
1624 words = simd_batch{ SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11) };
1625 shifts = simd_batch{ 0, 0, 0, 8 };
1626 results = (words >> shifts) & masks;
1627 results.store_unaligned(out);
1628 out += 4;
1629
1630 // extract 24-bit bundles 16 to 19
1631 words = simd_batch{ SafeLoad<uint32_t>(in + 12), SafeLoad<uint32_t>(in + 12) >> 24 | SafeLoad<uint32_t>(in + 13) << 8, SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14) };
1632 shifts = simd_batch{ 0, 0, 0, 8 };
1633 results = (words >> shifts) & masks;
1634 results.store_unaligned(out);
1635 out += 4;
1636
1637 // extract 24-bit bundles 20 to 23
1638 words = simd_batch{ SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 16 | SafeLoad<uint32_t>(in + 17) << 16, SafeLoad<uint32_t>(in + 17) };
1639 shifts = simd_batch{ 0, 0, 0, 8 };
1640 results = (words >> shifts) & masks;
1641 results.store_unaligned(out);
1642 out += 4;
1643
1644 // extract 24-bit bundles 24 to 27
1645 words = simd_batch{ SafeLoad<uint32_t>(in + 18), SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 16 | SafeLoad<uint32_t>(in + 20) << 16, SafeLoad<uint32_t>(in + 20) };
1646 shifts = simd_batch{ 0, 0, 0, 8 };
1647 results = (words >> shifts) & masks;
1648 results.store_unaligned(out);
1649 out += 4;
1650
1651 // extract 24-bit bundles 28 to 31
1652 words = simd_batch{ SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 24 | SafeLoad<uint32_t>(in + 22) << 8, SafeLoad<uint32_t>(in + 22) >> 16 | SafeLoad<uint32_t>(in + 23) << 16, SafeLoad<uint32_t>(in + 23) };
1653 shifts = simd_batch{ 0, 0, 0, 8 };
1654 results = (words >> shifts) & masks;
1655 results.store_unaligned(out);
1656 out += 4;
1657
1658 in += 24;
1659 return in;
1660}
1661
1662inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) {
1663 uint32_t mask = 0x1ffffff;
1664
1665 simd_batch masks(mask);
1666 simd_batch words, shifts;
1667 simd_batch results;
1668
1669 // extract 25-bit bundles 0 to 3
1670 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 25 | SafeLoad<uint32_t>(in + 1) << 7, SafeLoad<uint32_t>(in + 1) >> 18 | SafeLoad<uint32_t>(in + 2) << 14, SafeLoad<uint32_t>(in + 2) >> 11 | SafeLoad<uint32_t>(in + 3) << 21 };
1671 shifts = simd_batch{ 0, 0, 0, 0 };
1672 results = (words >> shifts) & masks;
1673 results.store_unaligned(out);
1674 out += 4;
1675
1676 // extract 25-bit bundles 4 to 7
1677 words = simd_batch{ SafeLoad<uint32_t>(in + 3), SafeLoad<uint32_t>(in + 3) >> 29 | SafeLoad<uint32_t>(in + 4) << 3, SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5) >> 15 | SafeLoad<uint32_t>(in + 6) << 17 };
1678 shifts = simd_batch{ 4, 0, 0, 0 };
1679 results = (words >> shifts) & masks;
1680 results.store_unaligned(out);
1681 out += 4;
1682
1683 // extract 25-bit bundles 8 to 11
1684 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 8 | SafeLoad<uint32_t>(in + 7) << 24, SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 26 | SafeLoad<uint32_t>(in + 8) << 6, SafeLoad<uint32_t>(in + 8) >> 19 | SafeLoad<uint32_t>(in + 9) << 13 };
1685 shifts = simd_batch{ 0, 1, 0, 0 };
1686 results = (words >> shifts) & masks;
1687 results.store_unaligned(out);
1688 out += 4;
1689
1690 // extract 25-bit bundles 12 to 15
1691 words = simd_batch{ SafeLoad<uint32_t>(in + 9) >> 12 | SafeLoad<uint32_t>(in + 10) << 20, SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 30 | SafeLoad<uint32_t>(in + 11) << 2, SafeLoad<uint32_t>(in + 11) >> 23 | SafeLoad<uint32_t>(in + 12) << 9 };
1692 shifts = simd_batch{ 0, 5, 0, 0 };
1693 results = (words >> shifts) & masks;
1694 results.store_unaligned(out);
1695 out += 4;
1696
1697 // extract 25-bit bundles 16 to 19
1698 words = simd_batch{ SafeLoad<uint32_t>(in + 12) >> 16 | SafeLoad<uint32_t>(in + 13) << 16, SafeLoad<uint32_t>(in + 13) >> 9 | SafeLoad<uint32_t>(in + 14) << 23, SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 27 | SafeLoad<uint32_t>(in + 15) << 5 };
1699 shifts = simd_batch{ 0, 0, 2, 0 };
1700 results = (words >> shifts) & masks;
1701 results.store_unaligned(out);
1702 out += 4;
1703
1704 // extract 25-bit bundles 20 to 23
1705 words = simd_batch{ SafeLoad<uint32_t>(in + 15) >> 20 | SafeLoad<uint32_t>(in + 16) << 12, SafeLoad<uint32_t>(in + 16) >> 13 | SafeLoad<uint32_t>(in + 17) << 19, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 31 | SafeLoad<uint32_t>(in + 18) << 1 };
1706 shifts = simd_batch{ 0, 0, 6, 0 };
1707 results = (words >> shifts) & masks;
1708 results.store_unaligned(out);
1709 out += 4;
1710
1711 // extract 25-bit bundles 24 to 27
1712 words = simd_batch{ SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 17 | SafeLoad<uint32_t>(in + 20) << 15, SafeLoad<uint32_t>(in + 20) >> 10 | SafeLoad<uint32_t>(in + 21) << 22, SafeLoad<uint32_t>(in + 21) };
1713 shifts = simd_batch{ 0, 0, 0, 3 };
1714 results = (words >> shifts) & masks;
1715 results.store_unaligned(out);
1716 out += 4;
1717
1718 // extract 25-bit bundles 28 to 31
1719 words = simd_batch{ SafeLoad<uint32_t>(in + 21) >> 28 | SafeLoad<uint32_t>(in + 22) << 4, SafeLoad<uint32_t>(in + 22) >> 21 | SafeLoad<uint32_t>(in + 23) << 11, SafeLoad<uint32_t>(in + 23) >> 14 | SafeLoad<uint32_t>(in + 24) << 18, SafeLoad<uint32_t>(in + 24) };
1720 shifts = simd_batch{ 0, 0, 0, 7 };
1721 results = (words >> shifts) & masks;
1722 results.store_unaligned(out);
1723 out += 4;
1724
1725 in += 25;
1726 return in;
1727}
1728
1729inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) {
1730 uint32_t mask = 0x3ffffff;
1731
1732 simd_batch masks(mask);
1733 simd_batch words, shifts;
1734 simd_batch results;
1735
1736 // extract 26-bit bundles 0 to 3
1737 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 26 | SafeLoad<uint32_t>(in + 1) << 6, SafeLoad<uint32_t>(in + 1) >> 20 | SafeLoad<uint32_t>(in + 2) << 12, SafeLoad<uint32_t>(in + 2) >> 14 | SafeLoad<uint32_t>(in + 3) << 18 };
1738 shifts = simd_batch{ 0, 0, 0, 0 };
1739 results = (words >> shifts) & masks;
1740 results.store_unaligned(out);
1741 out += 4;
1742
1743 // extract 26-bit bundles 4 to 7
1744 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 8 | SafeLoad<uint32_t>(in + 4) << 24, SafeLoad<uint32_t>(in + 4), SafeLoad<uint32_t>(in + 4) >> 28 | SafeLoad<uint32_t>(in + 5) << 4, SafeLoad<uint32_t>(in + 5) >> 22 | SafeLoad<uint32_t>(in + 6) << 10 };
1745 shifts = simd_batch{ 0, 2, 0, 0 };
1746 results = (words >> shifts) & masks;
1747 results.store_unaligned(out);
1748 out += 4;
1749
1750 // extract 26-bit bundles 8 to 11
1751 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 16 | SafeLoad<uint32_t>(in + 7) << 16, SafeLoad<uint32_t>(in + 7) >> 10 | SafeLoad<uint32_t>(in + 8) << 22, SafeLoad<uint32_t>(in + 8), SafeLoad<uint32_t>(in + 8) >> 30 | SafeLoad<uint32_t>(in + 9) << 2 };
1752 shifts = simd_batch{ 0, 0, 4, 0 };
1753 results = (words >> shifts) & masks;
1754 results.store_unaligned(out);
1755 out += 4;
1756
1757 // extract 26-bit bundles 12 to 15
1758 words = simd_batch{ SafeLoad<uint32_t>(in + 9) >> 24 | SafeLoad<uint32_t>(in + 10) << 8, SafeLoad<uint32_t>(in + 10) >> 18 | SafeLoad<uint32_t>(in + 11) << 14, SafeLoad<uint32_t>(in + 11) >> 12 | SafeLoad<uint32_t>(in + 12) << 20, SafeLoad<uint32_t>(in + 12) };
1759 shifts = simd_batch{ 0, 0, 0, 6 };
1760 results = (words >> shifts) & masks;
1761 results.store_unaligned(out);
1762 out += 4;
1763
1764 // extract 26-bit bundles 16 to 19
1765 words = simd_batch{ SafeLoad<uint32_t>(in + 13), SafeLoad<uint32_t>(in + 13) >> 26 | SafeLoad<uint32_t>(in + 14) << 6, SafeLoad<uint32_t>(in + 14) >> 20 | SafeLoad<uint32_t>(in + 15) << 12, SafeLoad<uint32_t>(in + 15) >> 14 | SafeLoad<uint32_t>(in + 16) << 18 };
1766 shifts = simd_batch{ 0, 0, 0, 0 };
1767 results = (words >> shifts) & masks;
1768 results.store_unaligned(out);
1769 out += 4;
1770
1771 // extract 26-bit bundles 20 to 23
1772 words = simd_batch{ SafeLoad<uint32_t>(in + 16) >> 8 | SafeLoad<uint32_t>(in + 17) << 24, SafeLoad<uint32_t>(in + 17), SafeLoad<uint32_t>(in + 17) >> 28 | SafeLoad<uint32_t>(in + 18) << 4, SafeLoad<uint32_t>(in + 18) >> 22 | SafeLoad<uint32_t>(in + 19) << 10 };
1773 shifts = simd_batch{ 0, 2, 0, 0 };
1774 results = (words >> shifts) & masks;
1775 results.store_unaligned(out);
1776 out += 4;
1777
1778 // extract 26-bit bundles 24 to 27
1779 words = simd_batch{ SafeLoad<uint32_t>(in + 19) >> 16 | SafeLoad<uint32_t>(in + 20) << 16, SafeLoad<uint32_t>(in + 20) >> 10 | SafeLoad<uint32_t>(in + 21) << 22, SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 30 | SafeLoad<uint32_t>(in + 22) << 2 };
1780 shifts = simd_batch{ 0, 0, 4, 0 };
1781 results = (words >> shifts) & masks;
1782 results.store_unaligned(out);
1783 out += 4;
1784
1785 // extract 26-bit bundles 28 to 31
1786 words = simd_batch{ SafeLoad<uint32_t>(in + 22) >> 24 | SafeLoad<uint32_t>(in + 23) << 8, SafeLoad<uint32_t>(in + 23) >> 18 | SafeLoad<uint32_t>(in + 24) << 14, SafeLoad<uint32_t>(in + 24) >> 12 | SafeLoad<uint32_t>(in + 25) << 20, SafeLoad<uint32_t>(in + 25) };
1787 shifts = simd_batch{ 0, 0, 0, 6 };
1788 results = (words >> shifts) & masks;
1789 results.store_unaligned(out);
1790 out += 4;
1791
1792 in += 26;
1793 return in;
1794}
1795
1796inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) {
1797 uint32_t mask = 0x7ffffff;
1798
1799 simd_batch masks(mask);
1800 simd_batch words, shifts;
1801 simd_batch results;
1802
1803 // extract 27-bit bundles 0 to 3
1804 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 27 | SafeLoad<uint32_t>(in + 1) << 5, SafeLoad<uint32_t>(in + 1) >> 22 | SafeLoad<uint32_t>(in + 2) << 10, SafeLoad<uint32_t>(in + 2) >> 17 | SafeLoad<uint32_t>(in + 3) << 15 };
1805 shifts = simd_batch{ 0, 0, 0, 0 };
1806 results = (words >> shifts) & masks;
1807 results.store_unaligned(out);
1808 out += 4;
1809
1810 // extract 27-bit bundles 4 to 7
1811 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 12 | SafeLoad<uint32_t>(in + 4) << 20, SafeLoad<uint32_t>(in + 4) >> 7 | SafeLoad<uint32_t>(in + 5) << 25, SafeLoad<uint32_t>(in + 5), SafeLoad<uint32_t>(in + 5) >> 29 | SafeLoad<uint32_t>(in + 6) << 3 };
1812 shifts = simd_batch{ 0, 0, 2, 0 };
1813 results = (words >> shifts) & masks;
1814 results.store_unaligned(out);
1815 out += 4;
1816
1817 // extract 27-bit bundles 8 to 11
1818 words = simd_batch{ SafeLoad<uint32_t>(in + 6) >> 24 | SafeLoad<uint32_t>(in + 7) << 8, SafeLoad<uint32_t>(in + 7) >> 19 | SafeLoad<uint32_t>(in + 8) << 13, SafeLoad<uint32_t>(in + 8) >> 14 | SafeLoad<uint32_t>(in + 9) << 18, SafeLoad<uint32_t>(in + 9) >> 9 | SafeLoad<uint32_t>(in + 10) << 23 };
1819 shifts = simd_batch{ 0, 0, 0, 0 };
1820 results = (words >> shifts) & masks;
1821 results.store_unaligned(out);
1822 out += 4;
1823
1824 // extract 27-bit bundles 12 to 15
1825 words = simd_batch{ SafeLoad<uint32_t>(in + 10), SafeLoad<uint32_t>(in + 10) >> 31 | SafeLoad<uint32_t>(in + 11) << 1, SafeLoad<uint32_t>(in + 11) >> 26 | SafeLoad<uint32_t>(in + 12) << 6, SafeLoad<uint32_t>(in + 12) >> 21 | SafeLoad<uint32_t>(in + 13) << 11 };
1826 shifts = simd_batch{ 4, 0, 0, 0 };
1827 results = (words >> shifts) & masks;
1828 results.store_unaligned(out);
1829 out += 4;
1830
1831 // extract 27-bit bundles 16 to 19
1832 words = simd_batch{ SafeLoad<uint32_t>(in + 13) >> 16 | SafeLoad<uint32_t>(in + 14) << 16, SafeLoad<uint32_t>(in + 14) >> 11 | SafeLoad<uint32_t>(in + 15) << 21, SafeLoad<uint32_t>(in + 15) >> 6 | SafeLoad<uint32_t>(in + 16) << 26, SafeLoad<uint32_t>(in + 16) };
1833 shifts = simd_batch{ 0, 0, 0, 1 };
1834 results = (words >> shifts) & masks;
1835 results.store_unaligned(out);
1836 out += 4;
1837
1838 // extract 27-bit bundles 20 to 23
1839 words = simd_batch{ SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) >> 23 | SafeLoad<uint32_t>(in + 18) << 9, SafeLoad<uint32_t>(in + 18) >> 18 | SafeLoad<uint32_t>(in + 19) << 14, SafeLoad<uint32_t>(in + 19) >> 13 | SafeLoad<uint32_t>(in + 20) << 19 };
1840 shifts = simd_batch{ 0, 0, 0, 0 };
1841 results = (words >> shifts) & masks;
1842 results.store_unaligned(out);
1843 out += 4;
1844
1845 // extract 27-bit bundles 24 to 27
1846 words = simd_batch{ SafeLoad<uint32_t>(in + 20) >> 8 | SafeLoad<uint32_t>(in + 21) << 24, SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 30 | SafeLoad<uint32_t>(in + 22) << 2, SafeLoad<uint32_t>(in + 22) >> 25 | SafeLoad<uint32_t>(in + 23) << 7 };
1847 shifts = simd_batch{ 0, 3, 0, 0 };
1848 results = (words >> shifts) & masks;
1849 results.store_unaligned(out);
1850 out += 4;
1851
1852 // extract 27-bit bundles 28 to 31
1853 words = simd_batch{ SafeLoad<uint32_t>(in + 23) >> 20 | SafeLoad<uint32_t>(in + 24) << 12, SafeLoad<uint32_t>(in + 24) >> 15 | SafeLoad<uint32_t>(in + 25) << 17, SafeLoad<uint32_t>(in + 25) >> 10 | SafeLoad<uint32_t>(in + 26) << 22, SafeLoad<uint32_t>(in + 26) };
1854 shifts = simd_batch{ 0, 0, 0, 5 };
1855 results = (words >> shifts) & masks;
1856 results.store_unaligned(out);
1857 out += 4;
1858
1859 in += 27;
1860 return in;
1861}
1862
1863inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) {
1864 uint32_t mask = 0xfffffff;
1865
1866 simd_batch masks(mask);
1867 simd_batch words, shifts;
1868 simd_batch results;
1869
1870 // extract 28-bit bundles 0 to 3
1871 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 28 | SafeLoad<uint32_t>(in + 1) << 4, SafeLoad<uint32_t>(in + 1) >> 24 | SafeLoad<uint32_t>(in + 2) << 8, SafeLoad<uint32_t>(in + 2) >> 20 | SafeLoad<uint32_t>(in + 3) << 12 };
1872 shifts = simd_batch{ 0, 0, 0, 0 };
1873 results = (words >> shifts) & masks;
1874 results.store_unaligned(out);
1875 out += 4;
1876
1877 // extract 28-bit bundles 4 to 7
1878 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 16 | SafeLoad<uint32_t>(in + 4) << 16, SafeLoad<uint32_t>(in + 4) >> 12 | SafeLoad<uint32_t>(in + 5) << 20, SafeLoad<uint32_t>(in + 5) >> 8 | SafeLoad<uint32_t>(in + 6) << 24, SafeLoad<uint32_t>(in + 6) };
1879 shifts = simd_batch{ 0, 0, 0, 4 };
1880 results = (words >> shifts) & masks;
1881 results.store_unaligned(out);
1882 out += 4;
1883
1884 // extract 28-bit bundles 8 to 11
1885 words = simd_batch{ SafeLoad<uint32_t>(in + 7), SafeLoad<uint32_t>(in + 7) >> 28 | SafeLoad<uint32_t>(in + 8) << 4, SafeLoad<uint32_t>(in + 8) >> 24 | SafeLoad<uint32_t>(in + 9) << 8, SafeLoad<uint32_t>(in + 9) >> 20 | SafeLoad<uint32_t>(in + 10) << 12 };
1886 shifts = simd_batch{ 0, 0, 0, 0 };
1887 results = (words >> shifts) & masks;
1888 results.store_unaligned(out);
1889 out += 4;
1890
1891 // extract 28-bit bundles 12 to 15
1892 words = simd_batch{ SafeLoad<uint32_t>(in + 10) >> 16 | SafeLoad<uint32_t>(in + 11) << 16, SafeLoad<uint32_t>(in + 11) >> 12 | SafeLoad<uint32_t>(in + 12) << 20, SafeLoad<uint32_t>(in + 12) >> 8 | SafeLoad<uint32_t>(in + 13) << 24, SafeLoad<uint32_t>(in + 13) };
1893 shifts = simd_batch{ 0, 0, 0, 4 };
1894 results = (words >> shifts) & masks;
1895 results.store_unaligned(out);
1896 out += 4;
1897
1898 // extract 28-bit bundles 16 to 19
1899 words = simd_batch{ SafeLoad<uint32_t>(in + 14), SafeLoad<uint32_t>(in + 14) >> 28 | SafeLoad<uint32_t>(in + 15) << 4, SafeLoad<uint32_t>(in + 15) >> 24 | SafeLoad<uint32_t>(in + 16) << 8, SafeLoad<uint32_t>(in + 16) >> 20 | SafeLoad<uint32_t>(in + 17) << 12 };
1900 shifts = simd_batch{ 0, 0, 0, 0 };
1901 results = (words >> shifts) & masks;
1902 results.store_unaligned(out);
1903 out += 4;
1904
1905 // extract 28-bit bundles 20 to 23
1906 words = simd_batch{ SafeLoad<uint32_t>(in + 17) >> 16 | SafeLoad<uint32_t>(in + 18) << 16, SafeLoad<uint32_t>(in + 18) >> 12 | SafeLoad<uint32_t>(in + 19) << 20, SafeLoad<uint32_t>(in + 19) >> 8 | SafeLoad<uint32_t>(in + 20) << 24, SafeLoad<uint32_t>(in + 20) };
1907 shifts = simd_batch{ 0, 0, 0, 4 };
1908 results = (words >> shifts) & masks;
1909 results.store_unaligned(out);
1910 out += 4;
1911
1912 // extract 28-bit bundles 24 to 27
1913 words = simd_batch{ SafeLoad<uint32_t>(in + 21), SafeLoad<uint32_t>(in + 21) >> 28 | SafeLoad<uint32_t>(in + 22) << 4, SafeLoad<uint32_t>(in + 22) >> 24 | SafeLoad<uint32_t>(in + 23) << 8, SafeLoad<uint32_t>(in + 23) >> 20 | SafeLoad<uint32_t>(in + 24) << 12 };
1914 shifts = simd_batch{ 0, 0, 0, 0 };
1915 results = (words >> shifts) & masks;
1916 results.store_unaligned(out);
1917 out += 4;
1918
1919 // extract 28-bit bundles 28 to 31
1920 words = simd_batch{ SafeLoad<uint32_t>(in + 24) >> 16 | SafeLoad<uint32_t>(in + 25) << 16, SafeLoad<uint32_t>(in + 25) >> 12 | SafeLoad<uint32_t>(in + 26) << 20, SafeLoad<uint32_t>(in + 26) >> 8 | SafeLoad<uint32_t>(in + 27) << 24, SafeLoad<uint32_t>(in + 27) };
1921 shifts = simd_batch{ 0, 0, 0, 4 };
1922 results = (words >> shifts) & masks;
1923 results.store_unaligned(out);
1924 out += 4;
1925
1926 in += 28;
1927 return in;
1928}
1929
1930inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) {
1931 uint32_t mask = 0x1fffffff;
1932
1933 simd_batch masks(mask);
1934 simd_batch words, shifts;
1935 simd_batch results;
1936
1937 // extract 29-bit bundles 0 to 3
1938 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 29 | SafeLoad<uint32_t>(in + 1) << 3, SafeLoad<uint32_t>(in + 1) >> 26 | SafeLoad<uint32_t>(in + 2) << 6, SafeLoad<uint32_t>(in + 2) >> 23 | SafeLoad<uint32_t>(in + 3) << 9 };
1939 shifts = simd_batch{ 0, 0, 0, 0 };
1940 results = (words >> shifts) & masks;
1941 results.store_unaligned(out);
1942 out += 4;
1943
1944 // extract 29-bit bundles 4 to 7
1945 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 20 | SafeLoad<uint32_t>(in + 4) << 12, SafeLoad<uint32_t>(in + 4) >> 17 | SafeLoad<uint32_t>(in + 5) << 15, SafeLoad<uint32_t>(in + 5) >> 14 | SafeLoad<uint32_t>(in + 6) << 18, SafeLoad<uint32_t>(in + 6) >> 11 | SafeLoad<uint32_t>(in + 7) << 21 };
1946 shifts = simd_batch{ 0, 0, 0, 0 };
1947 results = (words >> shifts) & masks;
1948 results.store_unaligned(out);
1949 out += 4;
1950
1951 // extract 29-bit bundles 8 to 11
1952 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 8 | SafeLoad<uint32_t>(in + 8) << 24, SafeLoad<uint32_t>(in + 8) >> 5 | SafeLoad<uint32_t>(in + 9) << 27, SafeLoad<uint32_t>(in + 9), SafeLoad<uint32_t>(in + 9) >> 31 | SafeLoad<uint32_t>(in + 10) << 1 };
1953 shifts = simd_batch{ 0, 0, 2, 0 };
1954 results = (words >> shifts) & masks;
1955 results.store_unaligned(out);
1956 out += 4;
1957
1958 // extract 29-bit bundles 12 to 15
1959 words = simd_batch{ SafeLoad<uint32_t>(in + 10) >> 28 | SafeLoad<uint32_t>(in + 11) << 4, SafeLoad<uint32_t>(in + 11) >> 25 | SafeLoad<uint32_t>(in + 12) << 7, SafeLoad<uint32_t>(in + 12) >> 22 | SafeLoad<uint32_t>(in + 13) << 10, SafeLoad<uint32_t>(in + 13) >> 19 | SafeLoad<uint32_t>(in + 14) << 13 };
1960 shifts = simd_batch{ 0, 0, 0, 0 };
1961 results = (words >> shifts) & masks;
1962 results.store_unaligned(out);
1963 out += 4;
1964
1965 // extract 29-bit bundles 16 to 19
1966 words = simd_batch{ SafeLoad<uint32_t>(in + 14) >> 16 | SafeLoad<uint32_t>(in + 15) << 16, SafeLoad<uint32_t>(in + 15) >> 13 | SafeLoad<uint32_t>(in + 16) << 19, SafeLoad<uint32_t>(in + 16) >> 10 | SafeLoad<uint32_t>(in + 17) << 22, SafeLoad<uint32_t>(in + 17) >> 7 | SafeLoad<uint32_t>(in + 18) << 25 };
1967 shifts = simd_batch{ 0, 0, 0, 0 };
1968 results = (words >> shifts) & masks;
1969 results.store_unaligned(out);
1970 out += 4;
1971
1972 // extract 29-bit bundles 20 to 23
1973 words = simd_batch{ SafeLoad<uint32_t>(in + 18) >> 4 | SafeLoad<uint32_t>(in + 19) << 28, SafeLoad<uint32_t>(in + 19), SafeLoad<uint32_t>(in + 19) >> 30 | SafeLoad<uint32_t>(in + 20) << 2, SafeLoad<uint32_t>(in + 20) >> 27 | SafeLoad<uint32_t>(in + 21) << 5 };
1974 shifts = simd_batch{ 0, 1, 0, 0 };
1975 results = (words >> shifts) & masks;
1976 results.store_unaligned(out);
1977 out += 4;
1978
1979 // extract 29-bit bundles 24 to 27
1980 words = simd_batch{ SafeLoad<uint32_t>(in + 21) >> 24 | SafeLoad<uint32_t>(in + 22) << 8, SafeLoad<uint32_t>(in + 22) >> 21 | SafeLoad<uint32_t>(in + 23) << 11, SafeLoad<uint32_t>(in + 23) >> 18 | SafeLoad<uint32_t>(in + 24) << 14, SafeLoad<uint32_t>(in + 24) >> 15 | SafeLoad<uint32_t>(in + 25) << 17 };
1981 shifts = simd_batch{ 0, 0, 0, 0 };
1982 results = (words >> shifts) & masks;
1983 results.store_unaligned(out);
1984 out += 4;
1985
1986 // extract 29-bit bundles 28 to 31
1987 words = simd_batch{ SafeLoad<uint32_t>(in + 25) >> 12 | SafeLoad<uint32_t>(in + 26) << 20, SafeLoad<uint32_t>(in + 26) >> 9 | SafeLoad<uint32_t>(in + 27) << 23, SafeLoad<uint32_t>(in + 27) >> 6 | SafeLoad<uint32_t>(in + 28) << 26, SafeLoad<uint32_t>(in + 28) };
1988 shifts = simd_batch{ 0, 0, 0, 3 };
1989 results = (words >> shifts) & masks;
1990 results.store_unaligned(out);
1991 out += 4;
1992
1993 in += 29;
1994 return in;
1995}
1996
1997inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) {
1998 uint32_t mask = 0x3fffffff;
1999
2000 simd_batch masks(mask);
2001 simd_batch words, shifts;
2002 simd_batch results;
2003
2004 // extract 30-bit bundles 0 to 3
2005 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 30 | SafeLoad<uint32_t>(in + 1) << 2, SafeLoad<uint32_t>(in + 1) >> 28 | SafeLoad<uint32_t>(in + 2) << 4, SafeLoad<uint32_t>(in + 2) >> 26 | SafeLoad<uint32_t>(in + 3) << 6 };
2006 shifts = simd_batch{ 0, 0, 0, 0 };
2007 results = (words >> shifts) & masks;
2008 results.store_unaligned(out);
2009 out += 4;
2010
2011 // extract 30-bit bundles 4 to 7
2012 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 24 | SafeLoad<uint32_t>(in + 4) << 8, SafeLoad<uint32_t>(in + 4) >> 22 | SafeLoad<uint32_t>(in + 5) << 10, SafeLoad<uint32_t>(in + 5) >> 20 | SafeLoad<uint32_t>(in + 6) << 12, SafeLoad<uint32_t>(in + 6) >> 18 | SafeLoad<uint32_t>(in + 7) << 14 };
2013 shifts = simd_batch{ 0, 0, 0, 0 };
2014 results = (words >> shifts) & masks;
2015 results.store_unaligned(out);
2016 out += 4;
2017
2018 // extract 30-bit bundles 8 to 11
2019 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 16 | SafeLoad<uint32_t>(in + 8) << 16, SafeLoad<uint32_t>(in + 8) >> 14 | SafeLoad<uint32_t>(in + 9) << 18, SafeLoad<uint32_t>(in + 9) >> 12 | SafeLoad<uint32_t>(in + 10) << 20, SafeLoad<uint32_t>(in + 10) >> 10 | SafeLoad<uint32_t>(in + 11) << 22 };
2020 shifts = simd_batch{ 0, 0, 0, 0 };
2021 results = (words >> shifts) & masks;
2022 results.store_unaligned(out);
2023 out += 4;
2024
2025 // extract 30-bit bundles 12 to 15
2026 words = simd_batch{ SafeLoad<uint32_t>(in + 11) >> 8 | SafeLoad<uint32_t>(in + 12) << 24, SafeLoad<uint32_t>(in + 12) >> 6 | SafeLoad<uint32_t>(in + 13) << 26, SafeLoad<uint32_t>(in + 13) >> 4 | SafeLoad<uint32_t>(in + 14) << 28, SafeLoad<uint32_t>(in + 14) };
2027 shifts = simd_batch{ 0, 0, 0, 2 };
2028 results = (words >> shifts) & masks;
2029 results.store_unaligned(out);
2030 out += 4;
2031
2032 // extract 30-bit bundles 16 to 19
2033 words = simd_batch{ SafeLoad<uint32_t>(in + 15), SafeLoad<uint32_t>(in + 15) >> 30 | SafeLoad<uint32_t>(in + 16) << 2, SafeLoad<uint32_t>(in + 16) >> 28 | SafeLoad<uint32_t>(in + 17) << 4, SafeLoad<uint32_t>(in + 17) >> 26 | SafeLoad<uint32_t>(in + 18) << 6 };
2034 shifts = simd_batch{ 0, 0, 0, 0 };
2035 results = (words >> shifts) & masks;
2036 results.store_unaligned(out);
2037 out += 4;
2038
2039 // extract 30-bit bundles 20 to 23
2040 words = simd_batch{ SafeLoad<uint32_t>(in + 18) >> 24 | SafeLoad<uint32_t>(in + 19) << 8, SafeLoad<uint32_t>(in + 19) >> 22 | SafeLoad<uint32_t>(in + 20) << 10, SafeLoad<uint32_t>(in + 20) >> 20 | SafeLoad<uint32_t>(in + 21) << 12, SafeLoad<uint32_t>(in + 21) >> 18 | SafeLoad<uint32_t>(in + 22) << 14 };
2041 shifts = simd_batch{ 0, 0, 0, 0 };
2042 results = (words >> shifts) & masks;
2043 results.store_unaligned(out);
2044 out += 4;
2045
2046 // extract 30-bit bundles 24 to 27
2047 words = simd_batch{ SafeLoad<uint32_t>(in + 22) >> 16 | SafeLoad<uint32_t>(in + 23) << 16, SafeLoad<uint32_t>(in + 23) >> 14 | SafeLoad<uint32_t>(in + 24) << 18, SafeLoad<uint32_t>(in + 24) >> 12 | SafeLoad<uint32_t>(in + 25) << 20, SafeLoad<uint32_t>(in + 25) >> 10 | SafeLoad<uint32_t>(in + 26) << 22 };
2048 shifts = simd_batch{ 0, 0, 0, 0 };
2049 results = (words >> shifts) & masks;
2050 results.store_unaligned(out);
2051 out += 4;
2052
2053 // extract 30-bit bundles 28 to 31
2054 words = simd_batch{ SafeLoad<uint32_t>(in + 26) >> 8 | SafeLoad<uint32_t>(in + 27) << 24, SafeLoad<uint32_t>(in + 27) >> 6 | SafeLoad<uint32_t>(in + 28) << 26, SafeLoad<uint32_t>(in + 28) >> 4 | SafeLoad<uint32_t>(in + 29) << 28, SafeLoad<uint32_t>(in + 29) };
2055 shifts = simd_batch{ 0, 0, 0, 2 };
2056 results = (words >> shifts) & masks;
2057 results.store_unaligned(out);
2058 out += 4;
2059
2060 in += 30;
2061 return in;
2062}
2063
2064inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) {
2065 uint32_t mask = 0x7fffffff;
2066
2067 simd_batch masks(mask);
2068 simd_batch words, shifts;
2069 simd_batch results;
2070
2071 // extract 31-bit bundles 0 to 3
2072 words = simd_batch{ SafeLoad<uint32_t>(in + 0), SafeLoad<uint32_t>(in + 0) >> 31 | SafeLoad<uint32_t>(in + 1) << 1, SafeLoad<uint32_t>(in + 1) >> 30 | SafeLoad<uint32_t>(in + 2) << 2, SafeLoad<uint32_t>(in + 2) >> 29 | SafeLoad<uint32_t>(in + 3) << 3 };
2073 shifts = simd_batch{ 0, 0, 0, 0 };
2074 results = (words >> shifts) & masks;
2075 results.store_unaligned(out);
2076 out += 4;
2077
2078 // extract 31-bit bundles 4 to 7
2079 words = simd_batch{ SafeLoad<uint32_t>(in + 3) >> 28 | SafeLoad<uint32_t>(in + 4) << 4, SafeLoad<uint32_t>(in + 4) >> 27 | SafeLoad<uint32_t>(in + 5) << 5, SafeLoad<uint32_t>(in + 5) >> 26 | SafeLoad<uint32_t>(in + 6) << 6, SafeLoad<uint32_t>(in + 6) >> 25 | SafeLoad<uint32_t>(in + 7) << 7 };
2080 shifts = simd_batch{ 0, 0, 0, 0 };
2081 results = (words >> shifts) & masks;
2082 results.store_unaligned(out);
2083 out += 4;
2084
2085 // extract 31-bit bundles 8 to 11
2086 words = simd_batch{ SafeLoad<uint32_t>(in + 7) >> 24 | SafeLoad<uint32_t>(in + 8) << 8, SafeLoad<uint32_t>(in + 8) >> 23 | SafeLoad<uint32_t>(in + 9) << 9, SafeLoad<uint32_t>(in + 9) >> 22 | SafeLoad<uint32_t>(in + 10) << 10, SafeLoad<uint32_t>(in + 10) >> 21 | SafeLoad<uint32_t>(in + 11) << 11 };
2087 shifts = simd_batch{ 0, 0, 0, 0 };
2088 results = (words >> shifts) & masks;
2089 results.store_unaligned(out);
2090 out += 4;
2091
2092 // extract 31-bit bundles 12 to 15
2093 words = simd_batch{ SafeLoad<uint32_t>(in + 11) >> 20 | SafeLoad<uint32_t>(in + 12) << 12, SafeLoad<uint32_t>(in + 12) >> 19 | SafeLoad<uint32_t>(in + 13) << 13, SafeLoad<uint32_t>(in + 13) >> 18 | SafeLoad<uint32_t>(in + 14) << 14, SafeLoad<uint32_t>(in + 14) >> 17 | SafeLoad<uint32_t>(in + 15) << 15 };
2094 shifts = simd_batch{ 0, 0, 0, 0 };
2095 results = (words >> shifts) & masks;
2096 results.store_unaligned(out);
2097 out += 4;
2098
2099 // extract 31-bit bundles 16 to 19
2100 words = simd_batch{ SafeLoad<uint32_t>(in + 15) >> 16 | SafeLoad<uint32_t>(in + 16) << 16, SafeLoad<uint32_t>(in + 16) >> 15 | SafeLoad<uint32_t>(in + 17) << 17, SafeLoad<uint32_t>(in + 17) >> 14 | SafeLoad<uint32_t>(in + 18) << 18, SafeLoad<uint32_t>(in + 18) >> 13 | SafeLoad<uint32_t>(in + 19) << 19 };
2101 shifts = simd_batch{ 0, 0, 0, 0 };
2102 results = (words >> shifts) & masks;
2103 results.store_unaligned(out);
2104 out += 4;
2105
2106 // extract 31-bit bundles 20 to 23
2107 words = simd_batch{ SafeLoad<uint32_t>(in + 19) >> 12 | SafeLoad<uint32_t>(in + 20) << 20, SafeLoad<uint32_t>(in + 20) >> 11 | SafeLoad<uint32_t>(in + 21) << 21, SafeLoad<uint32_t>(in + 21) >> 10 | SafeLoad<uint32_t>(in + 22) << 22, SafeLoad<uint32_t>(in + 22) >> 9 | SafeLoad<uint32_t>(in + 23) << 23 };
2108 shifts = simd_batch{ 0, 0, 0, 0 };
2109 results = (words >> shifts) & masks;
2110 results.store_unaligned(out);
2111 out += 4;
2112
2113 // extract 31-bit bundles 24 to 27
2114 words = simd_batch{ SafeLoad<uint32_t>(in + 23) >> 8 | SafeLoad<uint32_t>(in + 24) << 24, SafeLoad<uint32_t>(in + 24) >> 7 | SafeLoad<uint32_t>(in + 25) << 25, SafeLoad<uint32_t>(in + 25) >> 6 | SafeLoad<uint32_t>(in + 26) << 26, SafeLoad<uint32_t>(in + 26) >> 5 | SafeLoad<uint32_t>(in + 27) << 27 };
2115 shifts = simd_batch{ 0, 0, 0, 0 };
2116 results = (words >> shifts) & masks;
2117 results.store_unaligned(out);
2118 out += 4;
2119
2120 // extract 31-bit bundles 28 to 31
2121 words = simd_batch{ SafeLoad<uint32_t>(in + 27) >> 4 | SafeLoad<uint32_t>(in + 28) << 28, SafeLoad<uint32_t>(in + 28) >> 3 | SafeLoad<uint32_t>(in + 29) << 29, SafeLoad<uint32_t>(in + 29) >> 2 | SafeLoad<uint32_t>(in + 30) << 30, SafeLoad<uint32_t>(in + 30) };
2122 shifts = simd_batch{ 0, 0, 0, 1 };
2123 results = (words >> shifts) & masks;
2124 results.store_unaligned(out);
2125 out += 4;
2126
2127 in += 31;
2128 return in;
2129}
2130
// "Unpacks" 32 values that are already 32 bits wide: the 32 input words are
// copied verbatim from `in` to out[0..31].
// Returns `in` advanced past the 32 words that were consumed.
inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {
  // `out` is passed by value, so advancing it here would be invisible to the
  // caller; only the input cursor is reported back.
  std::memcpy(out, in, 32 * sizeof(*out));
  return in + 32;
}
2138
2139}; // struct UnpackBits128
2140
2141} // namespace
2142} // namespace internal
2143} // namespace arrow
2144