]>
git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/util/bpacking_simd256_generated.h
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
// Automatically generated file; DO NOT EDIT.
25 #include <xsimd/xsimd.hpp>
27 #include "arrow/util/dispatch.h"
28 #include "arrow/util/ubsan.h"
34 using ::arrow::util::SafeLoad
;
36 template <DispatchLevel level
>
37 struct UnpackBits256
{
39 using simd_arch
= xsimd::avx2
;
40 using simd_batch
= xsimd::batch
<uint32_t, simd_arch
>;
// Unpacks 32 values of 0 bits each: every output value is zero.
//
// in:  packed input; nothing is read because a 0-bit value occupies no space.
// out: destination; out[0] .. out[31] are set to zero.
// Returns `in` unchanged, since no input words are consumed.
inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) {
  memset(out, 0x0, 32 * sizeof(*out));

  return in;
}
49 inline static const uint32_t* unpack1_32(const uint32_t* in
, uint32_t* out
) {
52 simd_batch
masks(mask
);
53 simd_batch words
, shifts
;
56 // extract 1-bit bundles 0 to 7
57 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) };
58 shifts
= simd_batch
{ 0, 1, 2, 3, 4, 5, 6, 7 };
59 results
= (words
>> shifts
) & masks
;
60 results
.store_unaligned(out
);
63 // extract 1-bit bundles 8 to 15
64 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) };
65 shifts
= simd_batch
{ 8, 9, 10, 11, 12, 13, 14, 15 };
66 results
= (words
>> shifts
) & masks
;
67 results
.store_unaligned(out
);
70 // extract 1-bit bundles 16 to 23
71 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) };
72 shifts
= simd_batch
{ 16, 17, 18, 19, 20, 21, 22, 23 };
73 results
= (words
>> shifts
) & masks
;
74 results
.store_unaligned(out
);
77 // extract 1-bit bundles 24 to 31
78 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) };
79 shifts
= simd_batch
{ 24, 25, 26, 27, 28, 29, 30, 31 };
80 results
= (words
>> shifts
) & masks
;
81 results
.store_unaligned(out
);
88 inline static const uint32_t* unpack2_32(const uint32_t* in
, uint32_t* out
) {
91 simd_batch
masks(mask
);
92 simd_batch words
, shifts
;
95 // extract 2-bit bundles 0 to 7
96 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) };
97 shifts
= simd_batch
{ 0, 2, 4, 6, 8, 10, 12, 14 };
98 results
= (words
>> shifts
) & masks
;
99 results
.store_unaligned(out
);
102 // extract 2-bit bundles 8 to 15
103 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) };
104 shifts
= simd_batch
{ 16, 18, 20, 22, 24, 26, 28, 30 };
105 results
= (words
>> shifts
) & masks
;
106 results
.store_unaligned(out
);
109 // extract 2-bit bundles 16 to 23
110 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) };
111 shifts
= simd_batch
{ 0, 2, 4, 6, 8, 10, 12, 14 };
112 results
= (words
>> shifts
) & masks
;
113 results
.store_unaligned(out
);
116 // extract 2-bit bundles 24 to 31
117 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) };
118 shifts
= simd_batch
{ 16, 18, 20, 22, 24, 26, 28, 30 };
119 results
= (words
>> shifts
) & masks
;
120 results
.store_unaligned(out
);
127 inline static const uint32_t* unpack3_32(const uint32_t* in
, uint32_t* out
) {
130 simd_batch
masks(mask
);
131 simd_batch words
, shifts
;
134 // extract 3-bit bundles 0 to 7
135 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) };
136 shifts
= simd_batch
{ 0, 3, 6, 9, 12, 15, 18, 21 };
137 results
= (words
>> shifts
) & masks
;
138 results
.store_unaligned(out
);
141 // extract 3-bit bundles 8 to 15
142 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 30 | SafeLoad
<uint32_t>(in
+ 1) << 2, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) };
143 shifts
= simd_batch
{ 24, 27, 0, 1, 4, 7, 10, 13 };
144 results
= (words
>> shifts
) & masks
;
145 results
.store_unaligned(out
);
148 // extract 3-bit bundles 16 to 23
149 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 31 | SafeLoad
<uint32_t>(in
+ 2) << 1, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) };
150 shifts
= simd_batch
{ 16, 19, 22, 25, 28, 0, 2, 5 };
151 results
= (words
>> shifts
) & masks
;
152 results
.store_unaligned(out
);
155 // extract 3-bit bundles 24 to 31
156 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) };
157 shifts
= simd_batch
{ 8, 11, 14, 17, 20, 23, 26, 29 };
158 results
= (words
>> shifts
) & masks
;
159 results
.store_unaligned(out
);
166 inline static const uint32_t* unpack4_32(const uint32_t* in
, uint32_t* out
) {
169 simd_batch
masks(mask
);
170 simd_batch words
, shifts
;
173 // extract 4-bit bundles 0 to 7
174 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) };
175 shifts
= simd_batch
{ 0, 4, 8, 12, 16, 20, 24, 28 };
176 results
= (words
>> shifts
) & masks
;
177 results
.store_unaligned(out
);
180 // extract 4-bit bundles 8 to 15
181 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) };
182 shifts
= simd_batch
{ 0, 4, 8, 12, 16, 20, 24, 28 };
183 results
= (words
>> shifts
) & masks
;
184 results
.store_unaligned(out
);
187 // extract 4-bit bundles 16 to 23
188 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) };
189 shifts
= simd_batch
{ 0, 4, 8, 12, 16, 20, 24, 28 };
190 results
= (words
>> shifts
) & masks
;
191 results
.store_unaligned(out
);
194 // extract 4-bit bundles 24 to 31
195 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) };
196 shifts
= simd_batch
{ 0, 4, 8, 12, 16, 20, 24, 28 };
197 results
= (words
>> shifts
) & masks
;
198 results
.store_unaligned(out
);
205 inline static const uint32_t* unpack5_32(const uint32_t* in
, uint32_t* out
) {
206 uint32_t mask
= 0x1f;
208 simd_batch
masks(mask
);
209 simd_batch words
, shifts
;
212 // extract 5-bit bundles 0 to 7
213 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 30 | SafeLoad
<uint32_t>(in
+ 1) << 2, SafeLoad
<uint32_t>(in
+ 1) };
214 shifts
= simd_batch
{ 0, 5, 10, 15, 20, 25, 0, 3 };
215 results
= (words
>> shifts
) & masks
;
216 results
.store_unaligned(out
);
219 // extract 5-bit bundles 8 to 15
220 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 28 | SafeLoad
<uint32_t>(in
+ 2) << 4, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) };
221 shifts
= simd_batch
{ 8, 13, 18, 23, 0, 1, 6, 11 };
222 results
= (words
>> shifts
) & masks
;
223 results
.store_unaligned(out
);
226 // extract 5-bit bundles 16 to 23
227 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 31 | SafeLoad
<uint32_t>(in
+ 3) << 1, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) };
228 shifts
= simd_batch
{ 16, 21, 26, 0, 4, 9, 14, 19 };
229 results
= (words
>> shifts
) & masks
;
230 results
.store_unaligned(out
);
233 // extract 5-bit bundles 24 to 31
234 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 29 | SafeLoad
<uint32_t>(in
+ 4) << 3, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) };
235 shifts
= simd_batch
{ 24, 0, 2, 7, 12, 17, 22, 27 };
236 results
= (words
>> shifts
) & masks
;
237 results
.store_unaligned(out
);
244 inline static const uint32_t* unpack6_32(const uint32_t* in
, uint32_t* out
) {
245 uint32_t mask
= 0x3f;
247 simd_batch
masks(mask
);
248 simd_batch words
, shifts
;
251 // extract 6-bit bundles 0 to 7
252 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 30 | SafeLoad
<uint32_t>(in
+ 1) << 2, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) };
253 shifts
= simd_batch
{ 0, 6, 12, 18, 24, 0, 4, 10 };
254 results
= (words
>> shifts
) & masks
;
255 results
.store_unaligned(out
);
258 // extract 6-bit bundles 8 to 15
259 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 28 | SafeLoad
<uint32_t>(in
+ 2) << 4, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) };
260 shifts
= simd_batch
{ 16, 22, 0, 2, 8, 14, 20, 26 };
261 results
= (words
>> shifts
) & masks
;
262 results
.store_unaligned(out
);
265 // extract 6-bit bundles 16 to 23
266 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 30 | SafeLoad
<uint32_t>(in
+ 4) << 2, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) };
267 shifts
= simd_batch
{ 0, 6, 12, 18, 24, 0, 4, 10 };
268 results
= (words
>> shifts
) & masks
;
269 results
.store_unaligned(out
);
272 // extract 6-bit bundles 24 to 31
273 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 28 | SafeLoad
<uint32_t>(in
+ 5) << 4, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) };
274 shifts
= simd_batch
{ 16, 22, 0, 2, 8, 14, 20, 26 };
275 results
= (words
>> shifts
) & masks
;
276 results
.store_unaligned(out
);
283 inline static const uint32_t* unpack7_32(const uint32_t* in
, uint32_t* out
) {
284 uint32_t mask
= 0x7f;
286 simd_batch
masks(mask
);
287 simd_batch words
, shifts
;
290 // extract 7-bit bundles 0 to 7
291 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 28 | SafeLoad
<uint32_t>(in
+ 1) << 4, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) };
292 shifts
= simd_batch
{ 0, 7, 14, 21, 0, 3, 10, 17 };
293 results
= (words
>> shifts
) & masks
;
294 results
.store_unaligned(out
);
297 // extract 7-bit bundles 8 to 15
298 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 31 | SafeLoad
<uint32_t>(in
+ 2) << 1, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 27 | SafeLoad
<uint32_t>(in
+ 3) << 5, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) };
299 shifts
= simd_batch
{ 24, 0, 6, 13, 20, 0, 2, 9 };
300 results
= (words
>> shifts
) & masks
;
301 results
.store_unaligned(out
);
304 // extract 7-bit bundles 16 to 23
305 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 30 | SafeLoad
<uint32_t>(in
+ 4) << 2, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 26 | SafeLoad
<uint32_t>(in
+ 5) << 6, SafeLoad
<uint32_t>(in
+ 5) };
306 shifts
= simd_batch
{ 16, 23, 0, 5, 12, 19, 0, 1 };
307 results
= (words
>> shifts
) & masks
;
308 results
.store_unaligned(out
);
311 // extract 7-bit bundles 24 to 31
312 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 29 | SafeLoad
<uint32_t>(in
+ 6) << 3, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) };
313 shifts
= simd_batch
{ 8, 15, 22, 0, 4, 11, 18, 25 };
314 results
= (words
>> shifts
) & masks
;
315 results
.store_unaligned(out
);
322 inline static const uint32_t* unpack8_32(const uint32_t* in
, uint32_t* out
) {
323 uint32_t mask
= 0xff;
325 simd_batch
masks(mask
);
326 simd_batch words
, shifts
;
329 // extract 8-bit bundles 0 to 7
330 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) };
331 shifts
= simd_batch
{ 0, 8, 16, 24, 0, 8, 16, 24 };
332 results
= (words
>> shifts
) & masks
;
333 results
.store_unaligned(out
);
336 // extract 8-bit bundles 8 to 15
337 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) };
338 shifts
= simd_batch
{ 0, 8, 16, 24, 0, 8, 16, 24 };
339 results
= (words
>> shifts
) & masks
;
340 results
.store_unaligned(out
);
343 // extract 8-bit bundles 16 to 23
344 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) };
345 shifts
= simd_batch
{ 0, 8, 16, 24, 0, 8, 16, 24 };
346 results
= (words
>> shifts
) & masks
;
347 results
.store_unaligned(out
);
350 // extract 8-bit bundles 24 to 31
351 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) };
352 shifts
= simd_batch
{ 0, 8, 16, 24, 0, 8, 16, 24 };
353 results
= (words
>> shifts
) & masks
;
354 results
.store_unaligned(out
);
361 inline static const uint32_t* unpack9_32(const uint32_t* in
, uint32_t* out
) {
362 uint32_t mask
= 0x1ff;
364 simd_batch
masks(mask
);
365 simd_batch words
, shifts
;
368 // extract 9-bit bundles 0 to 7
369 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 27 | SafeLoad
<uint32_t>(in
+ 1) << 5, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 31 | SafeLoad
<uint32_t>(in
+ 2) << 1 };
370 shifts
= simd_batch
{ 0, 9, 18, 0, 4, 13, 22, 0 };
371 results
= (words
>> shifts
) & masks
;
372 results
.store_unaligned(out
);
375 // extract 9-bit bundles 8 to 15
376 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 26 | SafeLoad
<uint32_t>(in
+ 3) << 6, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 30 | SafeLoad
<uint32_t>(in
+ 4) << 2, SafeLoad
<uint32_t>(in
+ 4) };
377 shifts
= simd_batch
{ 8, 17, 0, 3, 12, 21, 0, 7 };
378 results
= (words
>> shifts
) & masks
;
379 results
.store_unaligned(out
);
382 // extract 9-bit bundles 16 to 23
383 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 25 | SafeLoad
<uint32_t>(in
+ 5) << 7, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 29 | SafeLoad
<uint32_t>(in
+ 6) << 3, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) };
384 shifts
= simd_batch
{ 16, 0, 2, 11, 20, 0, 6, 15 };
385 results
= (words
>> shifts
) & masks
;
386 results
.store_unaligned(out
);
389 // extract 9-bit bundles 24 to 31
390 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 6) >> 24 | SafeLoad
<uint32_t>(in
+ 7) << 8, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 28 | SafeLoad
<uint32_t>(in
+ 8) << 4, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) };
391 shifts
= simd_batch
{ 0, 1, 10, 19, 0, 5, 14, 23 };
392 results
= (words
>> shifts
) & masks
;
393 results
.store_unaligned(out
);
400 inline static const uint32_t* unpack10_32(const uint32_t* in
, uint32_t* out
) {
401 uint32_t mask
= 0x3ff;
403 simd_batch
masks(mask
);
404 simd_batch words
, shifts
;
407 // extract 10-bit bundles 0 to 7
408 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 30 | SafeLoad
<uint32_t>(in
+ 1) << 2, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 28 | SafeLoad
<uint32_t>(in
+ 2) << 4, SafeLoad
<uint32_t>(in
+ 2) };
409 shifts
= simd_batch
{ 0, 10, 20, 0, 8, 18, 0, 6 };
410 results
= (words
>> shifts
) & masks
;
411 results
.store_unaligned(out
);
414 // extract 10-bit bundles 8 to 15
415 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 26 | SafeLoad
<uint32_t>(in
+ 3) << 6, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 24 | SafeLoad
<uint32_t>(in
+ 4) << 8, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) };
416 shifts
= simd_batch
{ 16, 0, 4, 14, 0, 2, 12, 22 };
417 results
= (words
>> shifts
) & masks
;
418 results
.store_unaligned(out
);
421 // extract 10-bit bundles 16 to 23
422 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 30 | SafeLoad
<uint32_t>(in
+ 6) << 2, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 28 | SafeLoad
<uint32_t>(in
+ 7) << 4, SafeLoad
<uint32_t>(in
+ 7) };
423 shifts
= simd_batch
{ 0, 10, 20, 0, 8, 18, 0, 6 };
424 results
= (words
>> shifts
) & masks
;
425 results
.store_unaligned(out
);
428 // extract 10-bit bundles 24 to 31
429 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 26 | SafeLoad
<uint32_t>(in
+ 8) << 6, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 24 | SafeLoad
<uint32_t>(in
+ 9) << 8, SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) };
430 shifts
= simd_batch
{ 16, 0, 4, 14, 0, 2, 12, 22 };
431 results
= (words
>> shifts
) & masks
;
432 results
.store_unaligned(out
);
439 inline static const uint32_t* unpack11_32(const uint32_t* in
, uint32_t* out
) {
440 uint32_t mask
= 0x7ff;
442 simd_batch
masks(mask
);
443 simd_batch words
, shifts
;
446 // extract 11-bit bundles 0 to 7
447 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 22 | SafeLoad
<uint32_t>(in
+ 1) << 10, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 23 | SafeLoad
<uint32_t>(in
+ 2) << 9, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) };
448 shifts
= simd_batch
{ 0, 11, 0, 1, 12, 0, 2, 13 };
449 results
= (words
>> shifts
) & masks
;
450 results
.store_unaligned(out
);
453 // extract 11-bit bundles 8 to 15
454 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 2) >> 24 | SafeLoad
<uint32_t>(in
+ 3) << 8, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 25 | SafeLoad
<uint32_t>(in
+ 4) << 7, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 26 | SafeLoad
<uint32_t>(in
+ 5) << 6, SafeLoad
<uint32_t>(in
+ 5) };
455 shifts
= simd_batch
{ 0, 3, 14, 0, 4, 15, 0, 5 };
456 results
= (words
>> shifts
) & masks
;
457 results
.store_unaligned(out
);
460 // extract 11-bit bundles 16 to 23
461 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 27 | SafeLoad
<uint32_t>(in
+ 6) << 5, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 28 | SafeLoad
<uint32_t>(in
+ 7) << 4, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 29 | SafeLoad
<uint32_t>(in
+ 8) << 3 };
462 shifts
= simd_batch
{ 16, 0, 6, 17, 0, 7, 18, 0 };
463 results
= (words
>> shifts
) & masks
;
464 results
.store_unaligned(out
);
467 // extract 11-bit bundles 24 to 31
468 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 30 | SafeLoad
<uint32_t>(in
+ 9) << 2, SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 31 | SafeLoad
<uint32_t>(in
+ 10) << 1, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) };
469 shifts
= simd_batch
{ 8, 19, 0, 9, 20, 0, 10, 21 };
470 results
= (words
>> shifts
) & masks
;
471 results
.store_unaligned(out
);
478 inline static const uint32_t* unpack12_32(const uint32_t* in
, uint32_t* out
) {
479 uint32_t mask
= 0xfff;
481 simd_batch
masks(mask
);
482 simd_batch words
, shifts
;
485 // extract 12-bit bundles 0 to 7
486 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 24 | SafeLoad
<uint32_t>(in
+ 1) << 8, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 28 | SafeLoad
<uint32_t>(in
+ 2) << 4, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) };
487 shifts
= simd_batch
{ 0, 12, 0, 4, 16, 0, 8, 20 };
488 results
= (words
>> shifts
) & masks
;
489 results
.store_unaligned(out
);
492 // extract 12-bit bundles 8 to 15
493 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 24 | SafeLoad
<uint32_t>(in
+ 4) << 8, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 28 | SafeLoad
<uint32_t>(in
+ 5) << 4, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) };
494 shifts
= simd_batch
{ 0, 12, 0, 4, 16, 0, 8, 20 };
495 results
= (words
>> shifts
) & masks
;
496 results
.store_unaligned(out
);
499 // extract 12-bit bundles 16 to 23
500 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 24 | SafeLoad
<uint32_t>(in
+ 7) << 8, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 28 | SafeLoad
<uint32_t>(in
+ 8) << 4, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) };
501 shifts
= simd_batch
{ 0, 12, 0, 4, 16, 0, 8, 20 };
502 results
= (words
>> shifts
) & masks
;
503 results
.store_unaligned(out
);
506 // extract 12-bit bundles 24 to 31
507 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 24 | SafeLoad
<uint32_t>(in
+ 10) << 8, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 28 | SafeLoad
<uint32_t>(in
+ 11) << 4, SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) };
508 shifts
= simd_batch
{ 0, 12, 0, 4, 16, 0, 8, 20 };
509 results
= (words
>> shifts
) & masks
;
510 results
.store_unaligned(out
);
517 inline static const uint32_t* unpack13_32(const uint32_t* in
, uint32_t* out
) {
518 uint32_t mask
= 0x1fff;
520 simd_batch
masks(mask
);
521 simd_batch words
, shifts
;
524 // extract 13-bit bundles 0 to 7
525 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 26 | SafeLoad
<uint32_t>(in
+ 1) << 6, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 20 | SafeLoad
<uint32_t>(in
+ 2) << 12, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 27 | SafeLoad
<uint32_t>(in
+ 3) << 5 };
526 shifts
= simd_batch
{ 0, 13, 0, 7, 0, 1, 14, 0 };
527 results
= (words
>> shifts
) & masks
;
528 results
.store_unaligned(out
);
531 // extract 13-bit bundles 8 to 15
532 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 21 | SafeLoad
<uint32_t>(in
+ 4) << 11, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 28 | SafeLoad
<uint32_t>(in
+ 5) << 4, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 22 | SafeLoad
<uint32_t>(in
+ 6) << 10, SafeLoad
<uint32_t>(in
+ 6) };
533 shifts
= simd_batch
{ 8, 0, 2, 15, 0, 9, 0, 3 };
534 results
= (words
>> shifts
) & masks
;
535 results
.store_unaligned(out
);
538 // extract 13-bit bundles 16 to 23
539 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 29 | SafeLoad
<uint32_t>(in
+ 7) << 3, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 23 | SafeLoad
<uint32_t>(in
+ 8) << 9, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 30 | SafeLoad
<uint32_t>(in
+ 9) << 2, SafeLoad
<uint32_t>(in
+ 9) };
540 shifts
= simd_batch
{ 16, 0, 10, 0, 4, 17, 0, 11 };
541 results
= (words
>> shifts
) & masks
;
542 results
.store_unaligned(out
);
545 // extract 13-bit bundles 24 to 31
546 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 9) >> 24 | SafeLoad
<uint32_t>(in
+ 10) << 8, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 31 | SafeLoad
<uint32_t>(in
+ 11) << 1, SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 25 | SafeLoad
<uint32_t>(in
+ 12) << 7, SafeLoad
<uint32_t>(in
+ 12), SafeLoad
<uint32_t>(in
+ 12) };
547 shifts
= simd_batch
{ 0, 5, 18, 0, 12, 0, 6, 19 };
548 results
= (words
>> shifts
) & masks
;
549 results
.store_unaligned(out
);
556 inline static const uint32_t* unpack14_32(const uint32_t* in
, uint32_t* out
) {
557 uint32_t mask
= 0x3fff;
559 simd_batch
masks(mask
);
560 simd_batch words
, shifts
;
563 // extract 14-bit bundles 0 to 7
564 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 28 | SafeLoad
<uint32_t>(in
+ 1) << 4, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 24 | SafeLoad
<uint32_t>(in
+ 2) << 8, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 20 | SafeLoad
<uint32_t>(in
+ 3) << 12, SafeLoad
<uint32_t>(in
+ 3) };
565 shifts
= simd_batch
{ 0, 14, 0, 10, 0, 6, 0, 2 };
566 results
= (words
>> shifts
) & masks
;
567 results
.store_unaligned(out
);
570 // extract 14-bit bundles 8 to 15
571 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 30 | SafeLoad
<uint32_t>(in
+ 4) << 2, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 26 | SafeLoad
<uint32_t>(in
+ 5) << 6, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 22 | SafeLoad
<uint32_t>(in
+ 6) << 10, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) };
572 shifts
= simd_batch
{ 16, 0, 12, 0, 8, 0, 4, 18 };
573 results
= (words
>> shifts
) & masks
;
574 results
.store_unaligned(out
);
577 // extract 14-bit bundles 16 to 23
578 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 28 | SafeLoad
<uint32_t>(in
+ 8) << 4, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 24 | SafeLoad
<uint32_t>(in
+ 9) << 8, SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 20 | SafeLoad
<uint32_t>(in
+ 10) << 12, SafeLoad
<uint32_t>(in
+ 10) };
579 shifts
= simd_batch
{ 0, 14, 0, 10, 0, 6, 0, 2 };
580 results
= (words
>> shifts
) & masks
;
581 results
.store_unaligned(out
);
584 // extract 14-bit bundles 24 to 31
585 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 30 | SafeLoad
<uint32_t>(in
+ 11) << 2, SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 26 | SafeLoad
<uint32_t>(in
+ 12) << 6, SafeLoad
<uint32_t>(in
+ 12), SafeLoad
<uint32_t>(in
+ 12) >> 22 | SafeLoad
<uint32_t>(in
+ 13) << 10, SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13) };
586 shifts
= simd_batch
{ 16, 0, 12, 0, 8, 0, 4, 18 };
587 results
= (words
>> shifts
) & masks
;
588 results
.store_unaligned(out
);
595 inline static const uint32_t* unpack15_32(const uint32_t* in
, uint32_t* out
) {
596 uint32_t mask
= 0x7fff;
598 simd_batch
masks(mask
);
599 simd_batch words
, shifts
;
602 // extract 15-bit bundles 0 to 7
603 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 30 | SafeLoad
<uint32_t>(in
+ 1) << 2, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 28 | SafeLoad
<uint32_t>(in
+ 2) << 4, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 26 | SafeLoad
<uint32_t>(in
+ 3) << 6, SafeLoad
<uint32_t>(in
+ 3) };
604 shifts
= simd_batch
{ 0, 15, 0, 13, 0, 11, 0, 9 };
605 results
= (words
>> shifts
) & masks
;
606 results
.store_unaligned(out
);
609 // extract 15-bit bundles 8 to 15
610 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 3) >> 24 | SafeLoad
<uint32_t>(in
+ 4) << 8, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 22 | SafeLoad
<uint32_t>(in
+ 5) << 10, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 20 | SafeLoad
<uint32_t>(in
+ 6) << 12, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 18 | SafeLoad
<uint32_t>(in
+ 7) << 14, SafeLoad
<uint32_t>(in
+ 7) };
611 shifts
= simd_batch
{ 0, 7, 0, 5, 0, 3, 0, 1 };
612 results
= (words
>> shifts
) & masks
;
613 results
.store_unaligned(out
);
616 // extract 15-bit bundles 16 to 23
617 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 31 | SafeLoad
<uint32_t>(in
+ 8) << 1, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 29 | SafeLoad
<uint32_t>(in
+ 9) << 3, SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 27 | SafeLoad
<uint32_t>(in
+ 10) << 5, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 25 | SafeLoad
<uint32_t>(in
+ 11) << 7 };
618 shifts
= simd_batch
{ 16, 0, 14, 0, 12, 0, 10, 0 };
619 results
= (words
>> shifts
) & masks
;
620 results
.store_unaligned(out
);
623 // extract 15-bit bundles 24 to 31
624 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 23 | SafeLoad
<uint32_t>(in
+ 12) << 9, SafeLoad
<uint32_t>(in
+ 12), SafeLoad
<uint32_t>(in
+ 12) >> 21 | SafeLoad
<uint32_t>(in
+ 13) << 11, SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13) >> 19 | SafeLoad
<uint32_t>(in
+ 14) << 13, SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 14) };
625 shifts
= simd_batch
{ 8, 0, 6, 0, 4, 0, 2, 17 };
626 results
= (words
>> shifts
) & masks
;
627 results
.store_unaligned(out
);
634 inline static const uint32_t* unpack16_32(const uint32_t* in
, uint32_t* out
) {
635 uint32_t mask
= 0xffff;
637 simd_batch
masks(mask
);
638 simd_batch words
, shifts
;
641 // extract 16-bit bundles 0 to 7
642 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) };
643 shifts
= simd_batch
{ 0, 16, 0, 16, 0, 16, 0, 16 };
644 results
= (words
>> shifts
) & masks
;
645 results
.store_unaligned(out
);
648 // extract 16-bit bundles 8 to 15
649 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) };
650 shifts
= simd_batch
{ 0, 16, 0, 16, 0, 16, 0, 16 };
651 results
= (words
>> shifts
) & masks
;
652 results
.store_unaligned(out
);
655 // extract 16-bit bundles 16 to 23
656 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) };
657 shifts
= simd_batch
{ 0, 16, 0, 16, 0, 16, 0, 16 };
658 results
= (words
>> shifts
) & masks
;
659 results
.store_unaligned(out
);
662 // extract 16-bit bundles 24 to 31
663 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 12), SafeLoad
<uint32_t>(in
+ 12), SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 15), SafeLoad
<uint32_t>(in
+ 15) };
664 shifts
= simd_batch
{ 0, 16, 0, 16, 0, 16, 0, 16 };
665 results
= (words
>> shifts
) & masks
;
666 results
.store_unaligned(out
);
673 inline static const uint32_t* unpack17_32(const uint32_t* in
, uint32_t* out
) {
674 uint32_t mask
= 0x1ffff;
676 simd_batch
masks(mask
);
677 simd_batch words
, shifts
;
680 // extract 17-bit bundles 0 to 7
681 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 17 | SafeLoad
<uint32_t>(in
+ 1) << 15, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 19 | SafeLoad
<uint32_t>(in
+ 2) << 13, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 21 | SafeLoad
<uint32_t>(in
+ 3) << 11, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 23 | SafeLoad
<uint32_t>(in
+ 4) << 9 };
682 shifts
= simd_batch
{ 0, 0, 2, 0, 4, 0, 6, 0 };
683 results
= (words
>> shifts
) & masks
;
684 results
.store_unaligned(out
);
687 // extract 17-bit bundles 8 to 15
688 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 25 | SafeLoad
<uint32_t>(in
+ 5) << 7, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 27 | SafeLoad
<uint32_t>(in
+ 6) << 5, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 29 | SafeLoad
<uint32_t>(in
+ 7) << 3, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 31 | SafeLoad
<uint32_t>(in
+ 8) << 1 };
689 shifts
= simd_batch
{ 8, 0, 10, 0, 12, 0, 14, 0 };
690 results
= (words
>> shifts
) & masks
;
691 results
.store_unaligned(out
);
694 // extract 17-bit bundles 16 to 23
695 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 8) >> 16 | SafeLoad
<uint32_t>(in
+ 9) << 16, SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 18 | SafeLoad
<uint32_t>(in
+ 10) << 14, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 20 | SafeLoad
<uint32_t>(in
+ 11) << 12, SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 22 | SafeLoad
<uint32_t>(in
+ 12) << 10, SafeLoad
<uint32_t>(in
+ 12) };
696 shifts
= simd_batch
{ 0, 1, 0, 3, 0, 5, 0, 7 };
697 results
= (words
>> shifts
) & masks
;
698 results
.store_unaligned(out
);
701 // extract 17-bit bundles 24 to 31
702 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 12) >> 24 | SafeLoad
<uint32_t>(in
+ 13) << 8, SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13) >> 26 | SafeLoad
<uint32_t>(in
+ 14) << 6, SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 14) >> 28 | SafeLoad
<uint32_t>(in
+ 15) << 4, SafeLoad
<uint32_t>(in
+ 15), SafeLoad
<uint32_t>(in
+ 15) >> 30 | SafeLoad
<uint32_t>(in
+ 16) << 2, SafeLoad
<uint32_t>(in
+ 16) };
703 shifts
= simd_batch
{ 0, 9, 0, 11, 0, 13, 0, 15 };
704 results
= (words
>> shifts
) & masks
;
705 results
.store_unaligned(out
);
712 inline static const uint32_t* unpack18_32(const uint32_t* in
, uint32_t* out
) {
713 uint32_t mask
= 0x3ffff;
715 simd_batch
masks(mask
);
716 simd_batch words
, shifts
;
719 // extract 18-bit bundles 0 to 7
720 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 18 | SafeLoad
<uint32_t>(in
+ 1) << 14, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 22 | SafeLoad
<uint32_t>(in
+ 2) << 10, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 26 | SafeLoad
<uint32_t>(in
+ 3) << 6, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 30 | SafeLoad
<uint32_t>(in
+ 4) << 2 };
721 shifts
= simd_batch
{ 0, 0, 4, 0, 8, 0, 12, 0 };
722 results
= (words
>> shifts
) & masks
;
723 results
.store_unaligned(out
);
726 // extract 18-bit bundles 8 to 15
727 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 4) >> 16 | SafeLoad
<uint32_t>(in
+ 5) << 16, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 20 | SafeLoad
<uint32_t>(in
+ 6) << 12, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 24 | SafeLoad
<uint32_t>(in
+ 7) << 8, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 28 | SafeLoad
<uint32_t>(in
+ 8) << 4, SafeLoad
<uint32_t>(in
+ 8) };
728 shifts
= simd_batch
{ 0, 2, 0, 6, 0, 10, 0, 14 };
729 results
= (words
>> shifts
) & masks
;
730 results
.store_unaligned(out
);
733 // extract 18-bit bundles 16 to 23
734 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 18 | SafeLoad
<uint32_t>(in
+ 10) << 14, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 22 | SafeLoad
<uint32_t>(in
+ 11) << 10, SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 26 | SafeLoad
<uint32_t>(in
+ 12) << 6, SafeLoad
<uint32_t>(in
+ 12), SafeLoad
<uint32_t>(in
+ 12) >> 30 | SafeLoad
<uint32_t>(in
+ 13) << 2 };
735 shifts
= simd_batch
{ 0, 0, 4, 0, 8, 0, 12, 0 };
736 results
= (words
>> shifts
) & masks
;
737 results
.store_unaligned(out
);
740 // extract 18-bit bundles 24 to 31
741 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 13) >> 16 | SafeLoad
<uint32_t>(in
+ 14) << 16, SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 14) >> 20 | SafeLoad
<uint32_t>(in
+ 15) << 12, SafeLoad
<uint32_t>(in
+ 15), SafeLoad
<uint32_t>(in
+ 15) >> 24 | SafeLoad
<uint32_t>(in
+ 16) << 8, SafeLoad
<uint32_t>(in
+ 16), SafeLoad
<uint32_t>(in
+ 16) >> 28 | SafeLoad
<uint32_t>(in
+ 17) << 4, SafeLoad
<uint32_t>(in
+ 17) };
742 shifts
= simd_batch
{ 0, 2, 0, 6, 0, 10, 0, 14 };
743 results
= (words
>> shifts
) & masks
;
744 results
.store_unaligned(out
);
751 inline static const uint32_t* unpack19_32(const uint32_t* in
, uint32_t* out
) {
752 uint32_t mask
= 0x7ffff;
754 simd_batch
masks(mask
);
755 simd_batch words
, shifts
;
758 // extract 19-bit bundles 0 to 7
759 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 19 | SafeLoad
<uint32_t>(in
+ 1) << 13, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 25 | SafeLoad
<uint32_t>(in
+ 2) << 7, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 31 | SafeLoad
<uint32_t>(in
+ 3) << 1, SafeLoad
<uint32_t>(in
+ 3) >> 18 | SafeLoad
<uint32_t>(in
+ 4) << 14, SafeLoad
<uint32_t>(in
+ 4) };
760 shifts
= simd_batch
{ 0, 0, 6, 0, 12, 0, 0, 5 };
761 results
= (words
>> shifts
) & masks
;
762 results
.store_unaligned(out
);
765 // extract 19-bit bundles 8 to 15
766 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 4) >> 24 | SafeLoad
<uint32_t>(in
+ 5) << 8, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 30 | SafeLoad
<uint32_t>(in
+ 6) << 2, SafeLoad
<uint32_t>(in
+ 6) >> 17 | SafeLoad
<uint32_t>(in
+ 7) << 15, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 23 | SafeLoad
<uint32_t>(in
+ 8) << 9, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 29 | SafeLoad
<uint32_t>(in
+ 9) << 3 };
767 shifts
= simd_batch
{ 0, 11, 0, 0, 4, 0, 10, 0 };
768 results
= (words
>> shifts
) & masks
;
769 results
.store_unaligned(out
);
772 // extract 19-bit bundles 16 to 23
773 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 9) >> 16 | SafeLoad
<uint32_t>(in
+ 10) << 16, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 22 | SafeLoad
<uint32_t>(in
+ 11) << 10, SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 28 | SafeLoad
<uint32_t>(in
+ 12) << 4, SafeLoad
<uint32_t>(in
+ 12) >> 15 | SafeLoad
<uint32_t>(in
+ 13) << 17, SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13) >> 21 | SafeLoad
<uint32_t>(in
+ 14) << 11 };
774 shifts
= simd_batch
{ 0, 3, 0, 9, 0, 0, 2, 0 };
775 results
= (words
>> shifts
) & masks
;
776 results
.store_unaligned(out
);
779 // extract 19-bit bundles 24 to 31
780 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 14) >> 27 | SafeLoad
<uint32_t>(in
+ 15) << 5, SafeLoad
<uint32_t>(in
+ 15) >> 14 | SafeLoad
<uint32_t>(in
+ 16) << 18, SafeLoad
<uint32_t>(in
+ 16), SafeLoad
<uint32_t>(in
+ 16) >> 20 | SafeLoad
<uint32_t>(in
+ 17) << 12, SafeLoad
<uint32_t>(in
+ 17), SafeLoad
<uint32_t>(in
+ 17) >> 26 | SafeLoad
<uint32_t>(in
+ 18) << 6, SafeLoad
<uint32_t>(in
+ 18) };
781 shifts
= simd_batch
{ 8, 0, 0, 1, 0, 7, 0, 13 };
782 results
= (words
>> shifts
) & masks
;
783 results
.store_unaligned(out
);
790 inline static const uint32_t* unpack20_32(const uint32_t* in
, uint32_t* out
) {
791 uint32_t mask
= 0xfffff;
793 simd_batch
masks(mask
);
794 simd_batch words
, shifts
;
797 // extract 20-bit bundles 0 to 7
798 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 20 | SafeLoad
<uint32_t>(in
+ 1) << 12, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 28 | SafeLoad
<uint32_t>(in
+ 2) << 4, SafeLoad
<uint32_t>(in
+ 2) >> 16 | SafeLoad
<uint32_t>(in
+ 3) << 16, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 24 | SafeLoad
<uint32_t>(in
+ 4) << 8, SafeLoad
<uint32_t>(in
+ 4) };
799 shifts
= simd_batch
{ 0, 0, 8, 0, 0, 4, 0, 12 };
800 results
= (words
>> shifts
) & masks
;
801 results
.store_unaligned(out
);
804 // extract 20-bit bundles 8 to 15
805 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 20 | SafeLoad
<uint32_t>(in
+ 6) << 12, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 28 | SafeLoad
<uint32_t>(in
+ 7) << 4, SafeLoad
<uint32_t>(in
+ 7) >> 16 | SafeLoad
<uint32_t>(in
+ 8) << 16, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 24 | SafeLoad
<uint32_t>(in
+ 9) << 8, SafeLoad
<uint32_t>(in
+ 9) };
806 shifts
= simd_batch
{ 0, 0, 8, 0, 0, 4, 0, 12 };
807 results
= (words
>> shifts
) & masks
;
808 results
.store_unaligned(out
);
811 // extract 20-bit bundles 16 to 23
812 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 20 | SafeLoad
<uint32_t>(in
+ 11) << 12, SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 28 | SafeLoad
<uint32_t>(in
+ 12) << 4, SafeLoad
<uint32_t>(in
+ 12) >> 16 | SafeLoad
<uint32_t>(in
+ 13) << 16, SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13) >> 24 | SafeLoad
<uint32_t>(in
+ 14) << 8, SafeLoad
<uint32_t>(in
+ 14) };
813 shifts
= simd_batch
{ 0, 0, 8, 0, 0, 4, 0, 12 };
814 results
= (words
>> shifts
) & masks
;
815 results
.store_unaligned(out
);
818 // extract 20-bit bundles 24 to 31
819 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 15), SafeLoad
<uint32_t>(in
+ 15) >> 20 | SafeLoad
<uint32_t>(in
+ 16) << 12, SafeLoad
<uint32_t>(in
+ 16), SafeLoad
<uint32_t>(in
+ 16) >> 28 | SafeLoad
<uint32_t>(in
+ 17) << 4, SafeLoad
<uint32_t>(in
+ 17) >> 16 | SafeLoad
<uint32_t>(in
+ 18) << 16, SafeLoad
<uint32_t>(in
+ 18), SafeLoad
<uint32_t>(in
+ 18) >> 24 | SafeLoad
<uint32_t>(in
+ 19) << 8, SafeLoad
<uint32_t>(in
+ 19) };
820 shifts
= simd_batch
{ 0, 0, 8, 0, 0, 4, 0, 12 };
821 results
= (words
>> shifts
) & masks
;
822 results
.store_unaligned(out
);
829 inline static const uint32_t* unpack21_32(const uint32_t* in
, uint32_t* out
) {
830 uint32_t mask
= 0x1fffff;
832 simd_batch
masks(mask
);
833 simd_batch words
, shifts
;
836 // extract 21-bit bundles 0 to 7
837 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 21 | SafeLoad
<uint32_t>(in
+ 1) << 11, SafeLoad
<uint32_t>(in
+ 1), SafeLoad
<uint32_t>(in
+ 1) >> 31 | SafeLoad
<uint32_t>(in
+ 2) << 1, SafeLoad
<uint32_t>(in
+ 2) >> 20 | SafeLoad
<uint32_t>(in
+ 3) << 12, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 30 | SafeLoad
<uint32_t>(in
+ 4) << 2, SafeLoad
<uint32_t>(in
+ 4) >> 19 | SafeLoad
<uint32_t>(in
+ 5) << 13 };
838 shifts
= simd_batch
{ 0, 0, 10, 0, 0, 9, 0, 0 };
839 results
= (words
>> shifts
) & masks
;
840 results
.store_unaligned(out
);
843 // extract 21-bit bundles 8 to 15
844 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 29 | SafeLoad
<uint32_t>(in
+ 6) << 3, SafeLoad
<uint32_t>(in
+ 6) >> 18 | SafeLoad
<uint32_t>(in
+ 7) << 14, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 28 | SafeLoad
<uint32_t>(in
+ 8) << 4, SafeLoad
<uint32_t>(in
+ 8) >> 17 | SafeLoad
<uint32_t>(in
+ 9) << 15, SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 27 | SafeLoad
<uint32_t>(in
+ 10) << 5 };
845 shifts
= simd_batch
{ 8, 0, 0, 7, 0, 0, 6, 0 };
846 results
= (words
>> shifts
) & masks
;
847 results
.store_unaligned(out
);
850 // extract 21-bit bundles 16 to 23
851 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 10) >> 16 | SafeLoad
<uint32_t>(in
+ 11) << 16, SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 26 | SafeLoad
<uint32_t>(in
+ 12) << 6, SafeLoad
<uint32_t>(in
+ 12) >> 15 | SafeLoad
<uint32_t>(in
+ 13) << 17, SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13) >> 25 | SafeLoad
<uint32_t>(in
+ 14) << 7, SafeLoad
<uint32_t>(in
+ 14) >> 14 | SafeLoad
<uint32_t>(in
+ 15) << 18, SafeLoad
<uint32_t>(in
+ 15) };
852 shifts
= simd_batch
{ 0, 5, 0, 0, 4, 0, 0, 3 };
853 results
= (words
>> shifts
) & masks
;
854 results
.store_unaligned(out
);
857 // extract 21-bit bundles 24 to 31
858 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 15) >> 24 | SafeLoad
<uint32_t>(in
+ 16) << 8, SafeLoad
<uint32_t>(in
+ 16) >> 13 | SafeLoad
<uint32_t>(in
+ 17) << 19, SafeLoad
<uint32_t>(in
+ 17), SafeLoad
<uint32_t>(in
+ 17) >> 23 | SafeLoad
<uint32_t>(in
+ 18) << 9, SafeLoad
<uint32_t>(in
+ 18) >> 12 | SafeLoad
<uint32_t>(in
+ 19) << 20, SafeLoad
<uint32_t>(in
+ 19), SafeLoad
<uint32_t>(in
+ 19) >> 22 | SafeLoad
<uint32_t>(in
+ 20) << 10, SafeLoad
<uint32_t>(in
+ 20) };
859 shifts
= simd_batch
{ 0, 0, 2, 0, 0, 1, 0, 11 };
860 results
= (words
>> shifts
) & masks
;
861 results
.store_unaligned(out
);
868 inline static const uint32_t* unpack22_32(const uint32_t* in
, uint32_t* out
) {
869 uint32_t mask
= 0x3fffff;
871 simd_batch
masks(mask
);
872 simd_batch words
, shifts
;
875 // extract 22-bit bundles 0 to 7
876 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 22 | SafeLoad
<uint32_t>(in
+ 1) << 10, SafeLoad
<uint32_t>(in
+ 1) >> 12 | SafeLoad
<uint32_t>(in
+ 2) << 20, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 24 | SafeLoad
<uint32_t>(in
+ 3) << 8, SafeLoad
<uint32_t>(in
+ 3) >> 14 | SafeLoad
<uint32_t>(in
+ 4) << 18, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 26 | SafeLoad
<uint32_t>(in
+ 5) << 6 };
877 shifts
= simd_batch
{ 0, 0, 0, 2, 0, 0, 4, 0 };
878 results
= (words
>> shifts
) & masks
;
879 results
.store_unaligned(out
);
882 // extract 22-bit bundles 8 to 15
883 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 5) >> 16 | SafeLoad
<uint32_t>(in
+ 6) << 16, SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 28 | SafeLoad
<uint32_t>(in
+ 7) << 4, SafeLoad
<uint32_t>(in
+ 7) >> 18 | SafeLoad
<uint32_t>(in
+ 8) << 14, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 30 | SafeLoad
<uint32_t>(in
+ 9) << 2, SafeLoad
<uint32_t>(in
+ 9) >> 20 | SafeLoad
<uint32_t>(in
+ 10) << 12, SafeLoad
<uint32_t>(in
+ 10) };
884 shifts
= simd_batch
{ 0, 6, 0, 0, 8, 0, 0, 10 };
885 results
= (words
>> shifts
) & masks
;
886 results
.store_unaligned(out
);
889 // extract 22-bit bundles 16 to 23
890 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 11), SafeLoad
<uint32_t>(in
+ 11) >> 22 | SafeLoad
<uint32_t>(in
+ 12) << 10, SafeLoad
<uint32_t>(in
+ 12) >> 12 | SafeLoad
<uint32_t>(in
+ 13) << 20, SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13) >> 24 | SafeLoad
<uint32_t>(in
+ 14) << 8, SafeLoad
<uint32_t>(in
+ 14) >> 14 | SafeLoad
<uint32_t>(in
+ 15) << 18, SafeLoad
<uint32_t>(in
+ 15), SafeLoad
<uint32_t>(in
+ 15) >> 26 | SafeLoad
<uint32_t>(in
+ 16) << 6 };
891 shifts
= simd_batch
{ 0, 0, 0, 2, 0, 0, 4, 0 };
892 results
= (words
>> shifts
) & masks
;
893 results
.store_unaligned(out
);
896 // extract 22-bit bundles 24 to 31
897 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 16) >> 16 | SafeLoad
<uint32_t>(in
+ 17) << 16, SafeLoad
<uint32_t>(in
+ 17), SafeLoad
<uint32_t>(in
+ 17) >> 28 | SafeLoad
<uint32_t>(in
+ 18) << 4, SafeLoad
<uint32_t>(in
+ 18) >> 18 | SafeLoad
<uint32_t>(in
+ 19) << 14, SafeLoad
<uint32_t>(in
+ 19), SafeLoad
<uint32_t>(in
+ 19) >> 30 | SafeLoad
<uint32_t>(in
+ 20) << 2, SafeLoad
<uint32_t>(in
+ 20) >> 20 | SafeLoad
<uint32_t>(in
+ 21) << 12, SafeLoad
<uint32_t>(in
+ 21) };
898 shifts
= simd_batch
{ 0, 6, 0, 0, 8, 0, 0, 10 };
899 results
= (words
>> shifts
) & masks
;
900 results
.store_unaligned(out
);
907 inline static const uint32_t* unpack23_32(const uint32_t* in
, uint32_t* out
) {
908 uint32_t mask
= 0x7fffff;
910 simd_batch
masks(mask
);
911 simd_batch words
, shifts
;
914 // extract 23-bit bundles 0 to 7
915 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 23 | SafeLoad
<uint32_t>(in
+ 1) << 9, SafeLoad
<uint32_t>(in
+ 1) >> 14 | SafeLoad
<uint32_t>(in
+ 2) << 18, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 2) >> 28 | SafeLoad
<uint32_t>(in
+ 3) << 4, SafeLoad
<uint32_t>(in
+ 3) >> 19 | SafeLoad
<uint32_t>(in
+ 4) << 13, SafeLoad
<uint32_t>(in
+ 4) >> 10 | SafeLoad
<uint32_t>(in
+ 5) << 22, SafeLoad
<uint32_t>(in
+ 5) };
916 shifts
= simd_batch
{ 0, 0, 0, 5, 0, 0, 0, 1 };
917 results
= (words
>> shifts
) & masks
;
918 results
.store_unaligned(out
);
921 // extract 23-bit bundles 8 to 15
922 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 5) >> 24 | SafeLoad
<uint32_t>(in
+ 6) << 8, SafeLoad
<uint32_t>(in
+ 6) >> 15 | SafeLoad
<uint32_t>(in
+ 7) << 17, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 29 | SafeLoad
<uint32_t>(in
+ 8) << 3, SafeLoad
<uint32_t>(in
+ 8) >> 20 | SafeLoad
<uint32_t>(in
+ 9) << 12, SafeLoad
<uint32_t>(in
+ 9) >> 11 | SafeLoad
<uint32_t>(in
+ 10) << 21, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 25 | SafeLoad
<uint32_t>(in
+ 11) << 7 };
923 shifts
= simd_batch
{ 0, 0, 6, 0, 0, 0, 2, 0 };
924 results
= (words
>> shifts
) & masks
;
925 results
.store_unaligned(out
);
928 // extract 23-bit bundles 16 to 23
929 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 11) >> 16 | SafeLoad
<uint32_t>(in
+ 12) << 16, SafeLoad
<uint32_t>(in
+ 12), SafeLoad
<uint32_t>(in
+ 12) >> 30 | SafeLoad
<uint32_t>(in
+ 13) << 2, SafeLoad
<uint32_t>(in
+ 13) >> 21 | SafeLoad
<uint32_t>(in
+ 14) << 11, SafeLoad
<uint32_t>(in
+ 14) >> 12 | SafeLoad
<uint32_t>(in
+ 15) << 20, SafeLoad
<uint32_t>(in
+ 15), SafeLoad
<uint32_t>(in
+ 15) >> 26 | SafeLoad
<uint32_t>(in
+ 16) << 6, SafeLoad
<uint32_t>(in
+ 16) >> 17 | SafeLoad
<uint32_t>(in
+ 17) << 15 };
930 shifts
= simd_batch
{ 0, 7, 0, 0, 0, 3, 0, 0 };
931 results
= (words
>> shifts
) & masks
;
932 results
.store_unaligned(out
);
935 // extract 23-bit bundles 24 to 31
936 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 17), SafeLoad
<uint32_t>(in
+ 17) >> 31 | SafeLoad
<uint32_t>(in
+ 18) << 1, SafeLoad
<uint32_t>(in
+ 18) >> 22 | SafeLoad
<uint32_t>(in
+ 19) << 10, SafeLoad
<uint32_t>(in
+ 19) >> 13 | SafeLoad
<uint32_t>(in
+ 20) << 19, SafeLoad
<uint32_t>(in
+ 20), SafeLoad
<uint32_t>(in
+ 20) >> 27 | SafeLoad
<uint32_t>(in
+ 21) << 5, SafeLoad
<uint32_t>(in
+ 21) >> 18 | SafeLoad
<uint32_t>(in
+ 22) << 14, SafeLoad
<uint32_t>(in
+ 22) };
937 shifts
= simd_batch
{ 8, 0, 0, 0, 4, 0, 0, 9 };
938 results
= (words
>> shifts
) & masks
;
939 results
.store_unaligned(out
);
946 inline static const uint32_t* unpack24_32(const uint32_t* in
, uint32_t* out
) {
947 uint32_t mask
= 0xffffff;
949 simd_batch
masks(mask
);
950 simd_batch words
, shifts
;
953 // extract 24-bit bundles 0 to 7
954 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 24 | SafeLoad
<uint32_t>(in
+ 1) << 8, SafeLoad
<uint32_t>(in
+ 1) >> 16 | SafeLoad
<uint32_t>(in
+ 2) << 16, SafeLoad
<uint32_t>(in
+ 2), SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 24 | SafeLoad
<uint32_t>(in
+ 4) << 8, SafeLoad
<uint32_t>(in
+ 4) >> 16 | SafeLoad
<uint32_t>(in
+ 5) << 16, SafeLoad
<uint32_t>(in
+ 5) };
955 shifts
= simd_batch
{ 0, 0, 0, 8, 0, 0, 0, 8 };
956 results
= (words
>> shifts
) & masks
;
957 results
.store_unaligned(out
);
960 // extract 24-bit bundles 8 to 15
961 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 6), SafeLoad
<uint32_t>(in
+ 6) >> 24 | SafeLoad
<uint32_t>(in
+ 7) << 8, SafeLoad
<uint32_t>(in
+ 7) >> 16 | SafeLoad
<uint32_t>(in
+ 8) << 16, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 24 | SafeLoad
<uint32_t>(in
+ 10) << 8, SafeLoad
<uint32_t>(in
+ 10) >> 16 | SafeLoad
<uint32_t>(in
+ 11) << 16, SafeLoad
<uint32_t>(in
+ 11) };
962 shifts
= simd_batch
{ 0, 0, 0, 8, 0, 0, 0, 8 };
963 results
= (words
>> shifts
) & masks
;
964 results
.store_unaligned(out
);
967 // extract 24-bit bundles 16 to 23
968 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 12), SafeLoad
<uint32_t>(in
+ 12) >> 24 | SafeLoad
<uint32_t>(in
+ 13) << 8, SafeLoad
<uint32_t>(in
+ 13) >> 16 | SafeLoad
<uint32_t>(in
+ 14) << 16, SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 15), SafeLoad
<uint32_t>(in
+ 15) >> 24 | SafeLoad
<uint32_t>(in
+ 16) << 8, SafeLoad
<uint32_t>(in
+ 16) >> 16 | SafeLoad
<uint32_t>(in
+ 17) << 16, SafeLoad
<uint32_t>(in
+ 17) };
969 shifts
= simd_batch
{ 0, 0, 0, 8, 0, 0, 0, 8 };
970 results
= (words
>> shifts
) & masks
;
971 results
.store_unaligned(out
);
974 // extract 24-bit bundles 24 to 31
975 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 18), SafeLoad
<uint32_t>(in
+ 18) >> 24 | SafeLoad
<uint32_t>(in
+ 19) << 8, SafeLoad
<uint32_t>(in
+ 19) >> 16 | SafeLoad
<uint32_t>(in
+ 20) << 16, SafeLoad
<uint32_t>(in
+ 20), SafeLoad
<uint32_t>(in
+ 21), SafeLoad
<uint32_t>(in
+ 21) >> 24 | SafeLoad
<uint32_t>(in
+ 22) << 8, SafeLoad
<uint32_t>(in
+ 22) >> 16 | SafeLoad
<uint32_t>(in
+ 23) << 16, SafeLoad
<uint32_t>(in
+ 23) };
976 shifts
= simd_batch
{ 0, 0, 0, 8, 0, 0, 0, 8 };
977 results
= (words
>> shifts
) & masks
;
978 results
.store_unaligned(out
);
985 inline static const uint32_t* unpack25_32(const uint32_t* in
, uint32_t* out
) {
986 uint32_t mask
= 0x1ffffff;
988 simd_batch
masks(mask
);
989 simd_batch words
, shifts
;
992 // extract 25-bit bundles 0 to 7
993 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 25 | SafeLoad
<uint32_t>(in
+ 1) << 7, SafeLoad
<uint32_t>(in
+ 1) >> 18 | SafeLoad
<uint32_t>(in
+ 2) << 14, SafeLoad
<uint32_t>(in
+ 2) >> 11 | SafeLoad
<uint32_t>(in
+ 3) << 21, SafeLoad
<uint32_t>(in
+ 3), SafeLoad
<uint32_t>(in
+ 3) >> 29 | SafeLoad
<uint32_t>(in
+ 4) << 3, SafeLoad
<uint32_t>(in
+ 4) >> 22 | SafeLoad
<uint32_t>(in
+ 5) << 10, SafeLoad
<uint32_t>(in
+ 5) >> 15 | SafeLoad
<uint32_t>(in
+ 6) << 17 };
994 shifts
= simd_batch
{ 0, 0, 0, 0, 4, 0, 0, 0 };
995 results
= (words
>> shifts
) & masks
;
996 results
.store_unaligned(out
);
999 // extract 25-bit bundles 8 to 15
1000 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 6) >> 8 | SafeLoad
<uint32_t>(in
+ 7) << 24, SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 26 | SafeLoad
<uint32_t>(in
+ 8) << 6, SafeLoad
<uint32_t>(in
+ 8) >> 19 | SafeLoad
<uint32_t>(in
+ 9) << 13, SafeLoad
<uint32_t>(in
+ 9) >> 12 | SafeLoad
<uint32_t>(in
+ 10) << 20, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 30 | SafeLoad
<uint32_t>(in
+ 11) << 2, SafeLoad
<uint32_t>(in
+ 11) >> 23 | SafeLoad
<uint32_t>(in
+ 12) << 9 };
1001 shifts
= simd_batch
{ 0, 1, 0, 0, 0, 5, 0, 0 };
1002 results
= (words
>> shifts
) & masks
;
1003 results
.store_unaligned(out
);
1006 // extract 25-bit bundles 16 to 23
1007 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 12) >> 16 | SafeLoad
<uint32_t>(in
+ 13) << 16, SafeLoad
<uint32_t>(in
+ 13) >> 9 | SafeLoad
<uint32_t>(in
+ 14) << 23, SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 14) >> 27 | SafeLoad
<uint32_t>(in
+ 15) << 5, SafeLoad
<uint32_t>(in
+ 15) >> 20 | SafeLoad
<uint32_t>(in
+ 16) << 12, SafeLoad
<uint32_t>(in
+ 16) >> 13 | SafeLoad
<uint32_t>(in
+ 17) << 19, SafeLoad
<uint32_t>(in
+ 17), SafeLoad
<uint32_t>(in
+ 17) >> 31 | SafeLoad
<uint32_t>(in
+ 18) << 1 };
1008 shifts
= simd_batch
{ 0, 0, 2, 0, 0, 0, 6, 0 };
1009 results
= (words
>> shifts
) & masks
;
1010 results
.store_unaligned(out
);
1013 // extract 25-bit bundles 24 to 31
1014 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 18) >> 24 | SafeLoad
<uint32_t>(in
+ 19) << 8, SafeLoad
<uint32_t>(in
+ 19) >> 17 | SafeLoad
<uint32_t>(in
+ 20) << 15, SafeLoad
<uint32_t>(in
+ 20) >> 10 | SafeLoad
<uint32_t>(in
+ 21) << 22, SafeLoad
<uint32_t>(in
+ 21), SafeLoad
<uint32_t>(in
+ 21) >> 28 | SafeLoad
<uint32_t>(in
+ 22) << 4, SafeLoad
<uint32_t>(in
+ 22) >> 21 | SafeLoad
<uint32_t>(in
+ 23) << 11, SafeLoad
<uint32_t>(in
+ 23) >> 14 | SafeLoad
<uint32_t>(in
+ 24) << 18, SafeLoad
<uint32_t>(in
+ 24) };
1015 shifts
= simd_batch
{ 0, 0, 0, 3, 0, 0, 0, 7 };
1016 results
= (words
>> shifts
) & masks
;
1017 results
.store_unaligned(out
);
1024 inline static const uint32_t* unpack26_32(const uint32_t* in
, uint32_t* out
) {
1025 uint32_t mask
= 0x3ffffff;
1027 simd_batch
masks(mask
);
1028 simd_batch words
, shifts
;
1031 // extract 26-bit bundles 0 to 7
1032 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 26 | SafeLoad
<uint32_t>(in
+ 1) << 6, SafeLoad
<uint32_t>(in
+ 1) >> 20 | SafeLoad
<uint32_t>(in
+ 2) << 12, SafeLoad
<uint32_t>(in
+ 2) >> 14 | SafeLoad
<uint32_t>(in
+ 3) << 18, SafeLoad
<uint32_t>(in
+ 3) >> 8 | SafeLoad
<uint32_t>(in
+ 4) << 24, SafeLoad
<uint32_t>(in
+ 4), SafeLoad
<uint32_t>(in
+ 4) >> 28 | SafeLoad
<uint32_t>(in
+ 5) << 4, SafeLoad
<uint32_t>(in
+ 5) >> 22 | SafeLoad
<uint32_t>(in
+ 6) << 10 };
1033 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 2, 0, 0 };
1034 results
= (words
>> shifts
) & masks
;
1035 results
.store_unaligned(out
);
1038 // extract 26-bit bundles 8 to 15
1039 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 6) >> 16 | SafeLoad
<uint32_t>(in
+ 7) << 16, SafeLoad
<uint32_t>(in
+ 7) >> 10 | SafeLoad
<uint32_t>(in
+ 8) << 22, SafeLoad
<uint32_t>(in
+ 8), SafeLoad
<uint32_t>(in
+ 8) >> 30 | SafeLoad
<uint32_t>(in
+ 9) << 2, SafeLoad
<uint32_t>(in
+ 9) >> 24 | SafeLoad
<uint32_t>(in
+ 10) << 8, SafeLoad
<uint32_t>(in
+ 10) >> 18 | SafeLoad
<uint32_t>(in
+ 11) << 14, SafeLoad
<uint32_t>(in
+ 11) >> 12 | SafeLoad
<uint32_t>(in
+ 12) << 20, SafeLoad
<uint32_t>(in
+ 12) };
1040 shifts
= simd_batch
{ 0, 0, 4, 0, 0, 0, 0, 6 };
1041 results
= (words
>> shifts
) & masks
;
1042 results
.store_unaligned(out
);
1045 // extract 26-bit bundles 16 to 23
1046 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 13), SafeLoad
<uint32_t>(in
+ 13) >> 26 | SafeLoad
<uint32_t>(in
+ 14) << 6, SafeLoad
<uint32_t>(in
+ 14) >> 20 | SafeLoad
<uint32_t>(in
+ 15) << 12, SafeLoad
<uint32_t>(in
+ 15) >> 14 | SafeLoad
<uint32_t>(in
+ 16) << 18, SafeLoad
<uint32_t>(in
+ 16) >> 8 | SafeLoad
<uint32_t>(in
+ 17) << 24, SafeLoad
<uint32_t>(in
+ 17), SafeLoad
<uint32_t>(in
+ 17) >> 28 | SafeLoad
<uint32_t>(in
+ 18) << 4, SafeLoad
<uint32_t>(in
+ 18) >> 22 | SafeLoad
<uint32_t>(in
+ 19) << 10 };
1047 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 2, 0, 0 };
1048 results
= (words
>> shifts
) & masks
;
1049 results
.store_unaligned(out
);
1052 // extract 26-bit bundles 24 to 31
1053 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 19) >> 16 | SafeLoad
<uint32_t>(in
+ 20) << 16, SafeLoad
<uint32_t>(in
+ 20) >> 10 | SafeLoad
<uint32_t>(in
+ 21) << 22, SafeLoad
<uint32_t>(in
+ 21), SafeLoad
<uint32_t>(in
+ 21) >> 30 | SafeLoad
<uint32_t>(in
+ 22) << 2, SafeLoad
<uint32_t>(in
+ 22) >> 24 | SafeLoad
<uint32_t>(in
+ 23) << 8, SafeLoad
<uint32_t>(in
+ 23) >> 18 | SafeLoad
<uint32_t>(in
+ 24) << 14, SafeLoad
<uint32_t>(in
+ 24) >> 12 | SafeLoad
<uint32_t>(in
+ 25) << 20, SafeLoad
<uint32_t>(in
+ 25) };
1054 shifts
= simd_batch
{ 0, 0, 4, 0, 0, 0, 0, 6 };
1055 results
= (words
>> shifts
) & masks
;
1056 results
.store_unaligned(out
);
1063 inline static const uint32_t* unpack27_32(const uint32_t* in
, uint32_t* out
) {
1064 uint32_t mask
= 0x7ffffff;
1066 simd_batch
masks(mask
);
1067 simd_batch words
, shifts
;
1070 // extract 27-bit bundles 0 to 7
1071 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 27 | SafeLoad
<uint32_t>(in
+ 1) << 5, SafeLoad
<uint32_t>(in
+ 1) >> 22 | SafeLoad
<uint32_t>(in
+ 2) << 10, SafeLoad
<uint32_t>(in
+ 2) >> 17 | SafeLoad
<uint32_t>(in
+ 3) << 15, SafeLoad
<uint32_t>(in
+ 3) >> 12 | SafeLoad
<uint32_t>(in
+ 4) << 20, SafeLoad
<uint32_t>(in
+ 4) >> 7 | SafeLoad
<uint32_t>(in
+ 5) << 25, SafeLoad
<uint32_t>(in
+ 5), SafeLoad
<uint32_t>(in
+ 5) >> 29 | SafeLoad
<uint32_t>(in
+ 6) << 3 };
1072 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 2, 0 };
1073 results
= (words
>> shifts
) & masks
;
1074 results
.store_unaligned(out
);
1077 // extract 27-bit bundles 8 to 15
1078 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 6) >> 24 | SafeLoad
<uint32_t>(in
+ 7) << 8, SafeLoad
<uint32_t>(in
+ 7) >> 19 | SafeLoad
<uint32_t>(in
+ 8) << 13, SafeLoad
<uint32_t>(in
+ 8) >> 14 | SafeLoad
<uint32_t>(in
+ 9) << 18, SafeLoad
<uint32_t>(in
+ 9) >> 9 | SafeLoad
<uint32_t>(in
+ 10) << 23, SafeLoad
<uint32_t>(in
+ 10), SafeLoad
<uint32_t>(in
+ 10) >> 31 | SafeLoad
<uint32_t>(in
+ 11) << 1, SafeLoad
<uint32_t>(in
+ 11) >> 26 | SafeLoad
<uint32_t>(in
+ 12) << 6, SafeLoad
<uint32_t>(in
+ 12) >> 21 | SafeLoad
<uint32_t>(in
+ 13) << 11 };
1079 shifts
= simd_batch
{ 0, 0, 0, 0, 4, 0, 0, 0 };
1080 results
= (words
>> shifts
) & masks
;
1081 results
.store_unaligned(out
);
1084 // extract 27-bit bundles 16 to 23
1085 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 13) >> 16 | SafeLoad
<uint32_t>(in
+ 14) << 16, SafeLoad
<uint32_t>(in
+ 14) >> 11 | SafeLoad
<uint32_t>(in
+ 15) << 21, SafeLoad
<uint32_t>(in
+ 15) >> 6 | SafeLoad
<uint32_t>(in
+ 16) << 26, SafeLoad
<uint32_t>(in
+ 16), SafeLoad
<uint32_t>(in
+ 16) >> 28 | SafeLoad
<uint32_t>(in
+ 17) << 4, SafeLoad
<uint32_t>(in
+ 17) >> 23 | SafeLoad
<uint32_t>(in
+ 18) << 9, SafeLoad
<uint32_t>(in
+ 18) >> 18 | SafeLoad
<uint32_t>(in
+ 19) << 14, SafeLoad
<uint32_t>(in
+ 19) >> 13 | SafeLoad
<uint32_t>(in
+ 20) << 19 };
1086 shifts
= simd_batch
{ 0, 0, 0, 1, 0, 0, 0, 0 };
1087 results
= (words
>> shifts
) & masks
;
1088 results
.store_unaligned(out
);
1091 // extract 27-bit bundles 24 to 31
1092 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 20) >> 8 | SafeLoad
<uint32_t>(in
+ 21) << 24, SafeLoad
<uint32_t>(in
+ 21), SafeLoad
<uint32_t>(in
+ 21) >> 30 | SafeLoad
<uint32_t>(in
+ 22) << 2, SafeLoad
<uint32_t>(in
+ 22) >> 25 | SafeLoad
<uint32_t>(in
+ 23) << 7, SafeLoad
<uint32_t>(in
+ 23) >> 20 | SafeLoad
<uint32_t>(in
+ 24) << 12, SafeLoad
<uint32_t>(in
+ 24) >> 15 | SafeLoad
<uint32_t>(in
+ 25) << 17, SafeLoad
<uint32_t>(in
+ 25) >> 10 | SafeLoad
<uint32_t>(in
+ 26) << 22, SafeLoad
<uint32_t>(in
+ 26) };
1093 shifts
= simd_batch
{ 0, 3, 0, 0, 0, 0, 0, 5 };
1094 results
= (words
>> shifts
) & masks
;
1095 results
.store_unaligned(out
);
1102 inline static const uint32_t* unpack28_32(const uint32_t* in
, uint32_t* out
) {
1103 uint32_t mask
= 0xfffffff;
1105 simd_batch
masks(mask
);
1106 simd_batch words
, shifts
;
1109 // extract 28-bit bundles 0 to 7
1110 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 28 | SafeLoad
<uint32_t>(in
+ 1) << 4, SafeLoad
<uint32_t>(in
+ 1) >> 24 | SafeLoad
<uint32_t>(in
+ 2) << 8, SafeLoad
<uint32_t>(in
+ 2) >> 20 | SafeLoad
<uint32_t>(in
+ 3) << 12, SafeLoad
<uint32_t>(in
+ 3) >> 16 | SafeLoad
<uint32_t>(in
+ 4) << 16, SafeLoad
<uint32_t>(in
+ 4) >> 12 | SafeLoad
<uint32_t>(in
+ 5) << 20, SafeLoad
<uint32_t>(in
+ 5) >> 8 | SafeLoad
<uint32_t>(in
+ 6) << 24, SafeLoad
<uint32_t>(in
+ 6) };
1111 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 4 };
1112 results
= (words
>> shifts
) & masks
;
1113 results
.store_unaligned(out
);
1116 // extract 28-bit bundles 8 to 15
1117 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 7), SafeLoad
<uint32_t>(in
+ 7) >> 28 | SafeLoad
<uint32_t>(in
+ 8) << 4, SafeLoad
<uint32_t>(in
+ 8) >> 24 | SafeLoad
<uint32_t>(in
+ 9) << 8, SafeLoad
<uint32_t>(in
+ 9) >> 20 | SafeLoad
<uint32_t>(in
+ 10) << 12, SafeLoad
<uint32_t>(in
+ 10) >> 16 | SafeLoad
<uint32_t>(in
+ 11) << 16, SafeLoad
<uint32_t>(in
+ 11) >> 12 | SafeLoad
<uint32_t>(in
+ 12) << 20, SafeLoad
<uint32_t>(in
+ 12) >> 8 | SafeLoad
<uint32_t>(in
+ 13) << 24, SafeLoad
<uint32_t>(in
+ 13) };
1118 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 4 };
1119 results
= (words
>> shifts
) & masks
;
1120 results
.store_unaligned(out
);
1123 // extract 28-bit bundles 16 to 23
1124 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 14), SafeLoad
<uint32_t>(in
+ 14) >> 28 | SafeLoad
<uint32_t>(in
+ 15) << 4, SafeLoad
<uint32_t>(in
+ 15) >> 24 | SafeLoad
<uint32_t>(in
+ 16) << 8, SafeLoad
<uint32_t>(in
+ 16) >> 20 | SafeLoad
<uint32_t>(in
+ 17) << 12, SafeLoad
<uint32_t>(in
+ 17) >> 16 | SafeLoad
<uint32_t>(in
+ 18) << 16, SafeLoad
<uint32_t>(in
+ 18) >> 12 | SafeLoad
<uint32_t>(in
+ 19) << 20, SafeLoad
<uint32_t>(in
+ 19) >> 8 | SafeLoad
<uint32_t>(in
+ 20) << 24, SafeLoad
<uint32_t>(in
+ 20) };
1125 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 4 };
1126 results
= (words
>> shifts
) & masks
;
1127 results
.store_unaligned(out
);
1130 // extract 28-bit bundles 24 to 31
1131 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 21), SafeLoad
<uint32_t>(in
+ 21) >> 28 | SafeLoad
<uint32_t>(in
+ 22) << 4, SafeLoad
<uint32_t>(in
+ 22) >> 24 | SafeLoad
<uint32_t>(in
+ 23) << 8, SafeLoad
<uint32_t>(in
+ 23) >> 20 | SafeLoad
<uint32_t>(in
+ 24) << 12, SafeLoad
<uint32_t>(in
+ 24) >> 16 | SafeLoad
<uint32_t>(in
+ 25) << 16, SafeLoad
<uint32_t>(in
+ 25) >> 12 | SafeLoad
<uint32_t>(in
+ 26) << 20, SafeLoad
<uint32_t>(in
+ 26) >> 8 | SafeLoad
<uint32_t>(in
+ 27) << 24, SafeLoad
<uint32_t>(in
+ 27) };
1132 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 4 };
1133 results
= (words
>> shifts
) & masks
;
1134 results
.store_unaligned(out
);
1141 inline static const uint32_t* unpack29_32(const uint32_t* in
, uint32_t* out
) {
1142 uint32_t mask
= 0x1fffffff;
1144 simd_batch
masks(mask
);
1145 simd_batch words
, shifts
;
1148 // extract 29-bit bundles 0 to 7
1149 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 29 | SafeLoad
<uint32_t>(in
+ 1) << 3, SafeLoad
<uint32_t>(in
+ 1) >> 26 | SafeLoad
<uint32_t>(in
+ 2) << 6, SafeLoad
<uint32_t>(in
+ 2) >> 23 | SafeLoad
<uint32_t>(in
+ 3) << 9, SafeLoad
<uint32_t>(in
+ 3) >> 20 | SafeLoad
<uint32_t>(in
+ 4) << 12, SafeLoad
<uint32_t>(in
+ 4) >> 17 | SafeLoad
<uint32_t>(in
+ 5) << 15, SafeLoad
<uint32_t>(in
+ 5) >> 14 | SafeLoad
<uint32_t>(in
+ 6) << 18, SafeLoad
<uint32_t>(in
+ 6) >> 11 | SafeLoad
<uint32_t>(in
+ 7) << 21 };
1150 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 0 };
1151 results
= (words
>> shifts
) & masks
;
1152 results
.store_unaligned(out
);
1155 // extract 29-bit bundles 8 to 15
1156 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 7) >> 8 | SafeLoad
<uint32_t>(in
+ 8) << 24, SafeLoad
<uint32_t>(in
+ 8) >> 5 | SafeLoad
<uint32_t>(in
+ 9) << 27, SafeLoad
<uint32_t>(in
+ 9), SafeLoad
<uint32_t>(in
+ 9) >> 31 | SafeLoad
<uint32_t>(in
+ 10) << 1, SafeLoad
<uint32_t>(in
+ 10) >> 28 | SafeLoad
<uint32_t>(in
+ 11) << 4, SafeLoad
<uint32_t>(in
+ 11) >> 25 | SafeLoad
<uint32_t>(in
+ 12) << 7, SafeLoad
<uint32_t>(in
+ 12) >> 22 | SafeLoad
<uint32_t>(in
+ 13) << 10, SafeLoad
<uint32_t>(in
+ 13) >> 19 | SafeLoad
<uint32_t>(in
+ 14) << 13 };
1157 shifts
= simd_batch
{ 0, 0, 2, 0, 0, 0, 0, 0 };
1158 results
= (words
>> shifts
) & masks
;
1159 results
.store_unaligned(out
);
1162 // extract 29-bit bundles 16 to 23
1163 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 14) >> 16 | SafeLoad
<uint32_t>(in
+ 15) << 16, SafeLoad
<uint32_t>(in
+ 15) >> 13 | SafeLoad
<uint32_t>(in
+ 16) << 19, SafeLoad
<uint32_t>(in
+ 16) >> 10 | SafeLoad
<uint32_t>(in
+ 17) << 22, SafeLoad
<uint32_t>(in
+ 17) >> 7 | SafeLoad
<uint32_t>(in
+ 18) << 25, SafeLoad
<uint32_t>(in
+ 18) >> 4 | SafeLoad
<uint32_t>(in
+ 19) << 28, SafeLoad
<uint32_t>(in
+ 19), SafeLoad
<uint32_t>(in
+ 19) >> 30 | SafeLoad
<uint32_t>(in
+ 20) << 2, SafeLoad
<uint32_t>(in
+ 20) >> 27 | SafeLoad
<uint32_t>(in
+ 21) << 5 };
1164 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 1, 0, 0 };
1165 results
= (words
>> shifts
) & masks
;
1166 results
.store_unaligned(out
);
1169 // extract 29-bit bundles 24 to 31
1170 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 21) >> 24 | SafeLoad
<uint32_t>(in
+ 22) << 8, SafeLoad
<uint32_t>(in
+ 22) >> 21 | SafeLoad
<uint32_t>(in
+ 23) << 11, SafeLoad
<uint32_t>(in
+ 23) >> 18 | SafeLoad
<uint32_t>(in
+ 24) << 14, SafeLoad
<uint32_t>(in
+ 24) >> 15 | SafeLoad
<uint32_t>(in
+ 25) << 17, SafeLoad
<uint32_t>(in
+ 25) >> 12 | SafeLoad
<uint32_t>(in
+ 26) << 20, SafeLoad
<uint32_t>(in
+ 26) >> 9 | SafeLoad
<uint32_t>(in
+ 27) << 23, SafeLoad
<uint32_t>(in
+ 27) >> 6 | SafeLoad
<uint32_t>(in
+ 28) << 26, SafeLoad
<uint32_t>(in
+ 28) };
1171 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 3 };
1172 results
= (words
>> shifts
) & masks
;
1173 results
.store_unaligned(out
);
1180 inline static const uint32_t* unpack30_32(const uint32_t* in
, uint32_t* out
) {
1181 uint32_t mask
= 0x3fffffff;
1183 simd_batch
masks(mask
);
1184 simd_batch words
, shifts
;
1187 // extract 30-bit bundles 0 to 7
1188 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 30 | SafeLoad
<uint32_t>(in
+ 1) << 2, SafeLoad
<uint32_t>(in
+ 1) >> 28 | SafeLoad
<uint32_t>(in
+ 2) << 4, SafeLoad
<uint32_t>(in
+ 2) >> 26 | SafeLoad
<uint32_t>(in
+ 3) << 6, SafeLoad
<uint32_t>(in
+ 3) >> 24 | SafeLoad
<uint32_t>(in
+ 4) << 8, SafeLoad
<uint32_t>(in
+ 4) >> 22 | SafeLoad
<uint32_t>(in
+ 5) << 10, SafeLoad
<uint32_t>(in
+ 5) >> 20 | SafeLoad
<uint32_t>(in
+ 6) << 12, SafeLoad
<uint32_t>(in
+ 6) >> 18 | SafeLoad
<uint32_t>(in
+ 7) << 14 };
1189 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 0 };
1190 results
= (words
>> shifts
) & masks
;
1191 results
.store_unaligned(out
);
1194 // extract 30-bit bundles 8 to 15
1195 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 7) >> 16 | SafeLoad
<uint32_t>(in
+ 8) << 16, SafeLoad
<uint32_t>(in
+ 8) >> 14 | SafeLoad
<uint32_t>(in
+ 9) << 18, SafeLoad
<uint32_t>(in
+ 9) >> 12 | SafeLoad
<uint32_t>(in
+ 10) << 20, SafeLoad
<uint32_t>(in
+ 10) >> 10 | SafeLoad
<uint32_t>(in
+ 11) << 22, SafeLoad
<uint32_t>(in
+ 11) >> 8 | SafeLoad
<uint32_t>(in
+ 12) << 24, SafeLoad
<uint32_t>(in
+ 12) >> 6 | SafeLoad
<uint32_t>(in
+ 13) << 26, SafeLoad
<uint32_t>(in
+ 13) >> 4 | SafeLoad
<uint32_t>(in
+ 14) << 28, SafeLoad
<uint32_t>(in
+ 14) };
1196 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 2 };
1197 results
= (words
>> shifts
) & masks
;
1198 results
.store_unaligned(out
);
1201 // extract 30-bit bundles 16 to 23
1202 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 15), SafeLoad
<uint32_t>(in
+ 15) >> 30 | SafeLoad
<uint32_t>(in
+ 16) << 2, SafeLoad
<uint32_t>(in
+ 16) >> 28 | SafeLoad
<uint32_t>(in
+ 17) << 4, SafeLoad
<uint32_t>(in
+ 17) >> 26 | SafeLoad
<uint32_t>(in
+ 18) << 6, SafeLoad
<uint32_t>(in
+ 18) >> 24 | SafeLoad
<uint32_t>(in
+ 19) << 8, SafeLoad
<uint32_t>(in
+ 19) >> 22 | SafeLoad
<uint32_t>(in
+ 20) << 10, SafeLoad
<uint32_t>(in
+ 20) >> 20 | SafeLoad
<uint32_t>(in
+ 21) << 12, SafeLoad
<uint32_t>(in
+ 21) >> 18 | SafeLoad
<uint32_t>(in
+ 22) << 14 };
1203 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 0 };
1204 results
= (words
>> shifts
) & masks
;
1205 results
.store_unaligned(out
);
1208 // extract 30-bit bundles 24 to 31
1209 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 22) >> 16 | SafeLoad
<uint32_t>(in
+ 23) << 16, SafeLoad
<uint32_t>(in
+ 23) >> 14 | SafeLoad
<uint32_t>(in
+ 24) << 18, SafeLoad
<uint32_t>(in
+ 24) >> 12 | SafeLoad
<uint32_t>(in
+ 25) << 20, SafeLoad
<uint32_t>(in
+ 25) >> 10 | SafeLoad
<uint32_t>(in
+ 26) << 22, SafeLoad
<uint32_t>(in
+ 26) >> 8 | SafeLoad
<uint32_t>(in
+ 27) << 24, SafeLoad
<uint32_t>(in
+ 27) >> 6 | SafeLoad
<uint32_t>(in
+ 28) << 26, SafeLoad
<uint32_t>(in
+ 28) >> 4 | SafeLoad
<uint32_t>(in
+ 29) << 28, SafeLoad
<uint32_t>(in
+ 29) };
1210 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 2 };
1211 results
= (words
>> shifts
) & masks
;
1212 results
.store_unaligned(out
);
1219 inline static const uint32_t* unpack31_32(const uint32_t* in
, uint32_t* out
) {
1220 uint32_t mask
= 0x7fffffff;
1222 simd_batch
masks(mask
);
1223 simd_batch words
, shifts
;
1226 // extract 31-bit bundles 0 to 7
1227 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 0), SafeLoad
<uint32_t>(in
+ 0) >> 31 | SafeLoad
<uint32_t>(in
+ 1) << 1, SafeLoad
<uint32_t>(in
+ 1) >> 30 | SafeLoad
<uint32_t>(in
+ 2) << 2, SafeLoad
<uint32_t>(in
+ 2) >> 29 | SafeLoad
<uint32_t>(in
+ 3) << 3, SafeLoad
<uint32_t>(in
+ 3) >> 28 | SafeLoad
<uint32_t>(in
+ 4) << 4, SafeLoad
<uint32_t>(in
+ 4) >> 27 | SafeLoad
<uint32_t>(in
+ 5) << 5, SafeLoad
<uint32_t>(in
+ 5) >> 26 | SafeLoad
<uint32_t>(in
+ 6) << 6, SafeLoad
<uint32_t>(in
+ 6) >> 25 | SafeLoad
<uint32_t>(in
+ 7) << 7 };
1228 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 0 };
1229 results
= (words
>> shifts
) & masks
;
1230 results
.store_unaligned(out
);
1233 // extract 31-bit bundles 8 to 15
1234 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 7) >> 24 | SafeLoad
<uint32_t>(in
+ 8) << 8, SafeLoad
<uint32_t>(in
+ 8) >> 23 | SafeLoad
<uint32_t>(in
+ 9) << 9, SafeLoad
<uint32_t>(in
+ 9) >> 22 | SafeLoad
<uint32_t>(in
+ 10) << 10, SafeLoad
<uint32_t>(in
+ 10) >> 21 | SafeLoad
<uint32_t>(in
+ 11) << 11, SafeLoad
<uint32_t>(in
+ 11) >> 20 | SafeLoad
<uint32_t>(in
+ 12) << 12, SafeLoad
<uint32_t>(in
+ 12) >> 19 | SafeLoad
<uint32_t>(in
+ 13) << 13, SafeLoad
<uint32_t>(in
+ 13) >> 18 | SafeLoad
<uint32_t>(in
+ 14) << 14, SafeLoad
<uint32_t>(in
+ 14) >> 17 | SafeLoad
<uint32_t>(in
+ 15) << 15 };
1235 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 0 };
1236 results
= (words
>> shifts
) & masks
;
1237 results
.store_unaligned(out
);
1240 // extract 31-bit bundles 16 to 23
1241 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 15) >> 16 | SafeLoad
<uint32_t>(in
+ 16) << 16, SafeLoad
<uint32_t>(in
+ 16) >> 15 | SafeLoad
<uint32_t>(in
+ 17) << 17, SafeLoad
<uint32_t>(in
+ 17) >> 14 | SafeLoad
<uint32_t>(in
+ 18) << 18, SafeLoad
<uint32_t>(in
+ 18) >> 13 | SafeLoad
<uint32_t>(in
+ 19) << 19, SafeLoad
<uint32_t>(in
+ 19) >> 12 | SafeLoad
<uint32_t>(in
+ 20) << 20, SafeLoad
<uint32_t>(in
+ 20) >> 11 | SafeLoad
<uint32_t>(in
+ 21) << 21, SafeLoad
<uint32_t>(in
+ 21) >> 10 | SafeLoad
<uint32_t>(in
+ 22) << 22, SafeLoad
<uint32_t>(in
+ 22) >> 9 | SafeLoad
<uint32_t>(in
+ 23) << 23 };
1242 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 0 };
1243 results
= (words
>> shifts
) & masks
;
1244 results
.store_unaligned(out
);
1247 // extract 31-bit bundles 24 to 31
1248 words
= simd_batch
{ SafeLoad
<uint32_t>(in
+ 23) >> 8 | SafeLoad
<uint32_t>(in
+ 24) << 24, SafeLoad
<uint32_t>(in
+ 24) >> 7 | SafeLoad
<uint32_t>(in
+ 25) << 25, SafeLoad
<uint32_t>(in
+ 25) >> 6 | SafeLoad
<uint32_t>(in
+ 26) << 26, SafeLoad
<uint32_t>(in
+ 26) >> 5 | SafeLoad
<uint32_t>(in
+ 27) << 27, SafeLoad
<uint32_t>(in
+ 27) >> 4 | SafeLoad
<uint32_t>(in
+ 28) << 28, SafeLoad
<uint32_t>(in
+ 28) >> 3 | SafeLoad
<uint32_t>(in
+ 29) << 29, SafeLoad
<uint32_t>(in
+ 29) >> 2 | SafeLoad
<uint32_t>(in
+ 30) << 30, SafeLoad
<uint32_t>(in
+ 30) };
1249 shifts
= simd_batch
{ 0, 0, 0, 0, 0, 0, 0, 1 };
1250 results
= (words
>> shifts
) & masks
;
1251 results
.store_unaligned(out
);
/// "Unpack" 32 values of 32 bits each: the packed and unpacked layouts are
/// identical at this width, so the 32 input words are copied verbatim.
///
/// \param in  packed input; words in[0] .. in[31] are read
/// \param out destination; out[0] .. out[31] are written
/// \return pointer to the first input word after the consumed block (in + 32)
inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {
  // memcpy rather than a typed loop: no alignment assumption is made on `in`.
  memcpy(out, in, 32 * sizeof(*out));
  in += 32;

  return in;
}
1266 }; // struct UnpackBits256
1269 } // namespace internal
1270 } // namespace arrow