1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16
17 #include <stdint.h>
18 #include <immintrin.h>
19 #include <string.h>
20
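// Each unpack<N>_32_avx2 routine below decodes 32 values that were bit-packed
// at N bits per value (LSB-first within consecutive 32-bit words) from `in`,
// writes them zero-extended into out[0..31], and returns `in` advanced by N
// words. unpack0_32_avx2 simply zero-fills the 32 outputs.
//
// A minimal usage sketch (illustrative only; the buffer names here are
// assumptions, not part of this file):
//
//   uint32_t values[64];
//   const uint32_t* p = packed;           // hypothetical 5-bit-packed input
//   p = unpack5_32_avx2(p, values);       // first 32 values, consumes 5 words
//   p = unpack5_32_avx2(p, values + 32);  // next 32 values, consumes 5 more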
21 inline static const uint32_t* unpack0_32_avx2(const uint32_t* in, uint32_t* out) {
22 memset(out, 0x0, 32 * sizeof(*out));
23 out += 32;
24
25 return in;
26 }
27
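// General pattern for the non-trivial widths: reg_shifts holds a per-lane
// right-shift count (note that _mm256_set_epi32 lists lanes from highest to
// lowest), reg_inls holds the source word for each lane, and
// _mm256_srlv_epi32 applies the per-lane variable shifts in one instruction
// before the N-bit mask is ANDed on. Each store emits 8 of the 32 outputs.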
28 inline static const uint32_t* unpack1_32_avx2(const uint32_t* in, uint32_t* out) {
29 uint32_t mask = 0x1;
30 __m256i reg_shifts, reg_inls, reg_masks;
31 __m256i results;
32
33 reg_masks = _mm256_set1_epi32(mask);
34
35 // shift the first 8 outs
36 reg_shifts = _mm256_set_epi32(7, 6, 5, 4,
37 3, 2, 1, 0);
38 reg_inls = _mm256_set_epi32(in[0], in[0],
39 in[0], in[0],
40 in[0], in[0],
41 in[0], in[0]);
42 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
43 _mm256_storeu_si256((__m256i*)(out), results);
44 out += 8;
45
46 // shift the second 8 outs
47 reg_shifts = _mm256_set_epi32(15, 14, 13, 12,
48 11, 10, 9, 8);
49 reg_inls = _mm256_set_epi32(in[0], in[0],
50 in[0], in[0],
51 in[0], in[0],
52 in[0], in[0]);
53 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
54 _mm256_storeu_si256((__m256i*)(out), results);
55 out += 8;
56
57 // shift the third 8 outs
58 reg_shifts = _mm256_set_epi32(23, 22, 21, 20,
59 19, 18, 17, 16);
60 reg_inls = _mm256_set_epi32(in[0], in[0],
61 in[0], in[0],
62 in[0], in[0],
63 in[0], in[0]);
64 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
65 _mm256_storeu_si256((__m256i*)(out), results);
66 out += 8;
67
68 // shift the last 8 outs
69 reg_shifts = _mm256_set_epi32(31, 30, 29, 28,
70 27, 26, 25, 24);
71 reg_inls = _mm256_set_epi32(in[0], in[0],
72 in[0], in[0],
73 in[0], in[0],
74 in[0], in[0]);
75 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
76 _mm256_storeu_si256((__m256i*)(out), results);
77 out += 8;
78
79 in += 1;
80
81 return in;
82 }
83
84 inline static const uint32_t* unpack2_32_avx2(const uint32_t* in, uint32_t* out) {
85 uint32_t mask = 0x3;
86 __m256i reg_shifts, reg_inls, reg_masks;
87 __m256i results;
88
89 reg_masks = _mm256_set1_epi32(mask);
90
91 // shift the first 8 outs
92 reg_shifts = _mm256_set_epi32(14, 12, 10, 8,
93 6, 4, 2, 0);
94 reg_inls = _mm256_set_epi32(in[0], in[0],
95 in[0], in[0],
96 in[0], in[0],
97 in[0], in[0]);
98 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
99 _mm256_storeu_si256((__m256i*)(out), results);
100 out += 8;
101
102 // shift the second 8 outs
103 reg_shifts = _mm256_set_epi32(30, 28, 26, 24,
104 22, 20, 18, 16);
105 reg_inls = _mm256_set_epi32(in[0], in[0],
106 in[0], in[0],
107 in[0], in[0],
108 in[0], in[0]);
109 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
110 _mm256_storeu_si256((__m256i*)(out), results);
111 out += 8;
112
113 // shift the third 8 outs
114 reg_shifts = _mm256_set_epi32(14, 12, 10, 8,
115 6, 4, 2, 0);
116 reg_inls = _mm256_set_epi32(in[1], in[1],
117 in[1], in[1],
118 in[1], in[1],
119 in[1], in[1]);
120 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
121 _mm256_storeu_si256((__m256i*)(out), results);
122 out += 8;
123
124 // shift the last 8 outs
125 reg_shifts = _mm256_set_epi32(30, 28, 26, 24,
126 22, 20, 18, 16);
127 reg_inls = _mm256_set_epi32(in[1], in[1],
128 in[1], in[1],
129 in[1], in[1],
130 in[1], in[1]);
131 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
132 _mm256_storeu_si256((__m256i*)(out), results);
133 out += 8;
134
135 in += 2;
136
137 return in;
138 }
139
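// From width 3 onward some values straddle a 32-bit word boundary. Those
// lanes load a pre-stitched word, in[k] >> r | in[k+1] << (32 - r), and use a
// shift count of 0, so the subsequent mask still extracts the correct bits.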
140 inline static const uint32_t* unpack3_32_avx2(const uint32_t* in, uint32_t* out) {
141 uint32_t mask = 0x7;
142 __m256i reg_shifts, reg_inls, reg_masks;
143 __m256i results;
144
145 reg_masks = _mm256_set1_epi32(mask);
146
147 // shift the first 8 outs
148 reg_shifts = _mm256_set_epi32(21, 18, 15, 12,
149 9, 6, 3, 0);
150 reg_inls = _mm256_set_epi32(in[0], in[0],
151 in[0], in[0],
152 in[0], in[0],
153 in[0], in[0]);
154 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
155 _mm256_storeu_si256((__m256i*)(out), results);
156 out += 8;
157
158 // shift the second 8 outs
159 reg_shifts = _mm256_set_epi32(13, 10, 7, 4,
160 1, 0, 27, 24);
161 reg_inls = _mm256_set_epi32(in[1], in[1],
162 in[1], in[1],
163 in[1], in[0] >> 30 | in[1] << 2,
164 in[0], in[0]);
165 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
166 _mm256_storeu_si256((__m256i*)(out), results);
167 out += 8;
168
169 // shift the third 8 outs
170 reg_shifts = _mm256_set_epi32(5, 2, 0, 28,
171 25, 22, 19, 16);
172 reg_inls = _mm256_set_epi32(in[2], in[2],
173 in[1] >> 31 | in[2] << 1, in[1],
174 in[1], in[1],
175 in[1], in[1]);
176 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
177 _mm256_storeu_si256((__m256i*)(out), results);
178 out += 8;
179
180 // shift the last 8 outs
181 reg_shifts = _mm256_set_epi32(29, 26, 23, 20,
182 17, 14, 11, 8);
183 reg_inls = _mm256_set_epi32(in[2], in[2],
184 in[2], in[2],
185 in[2], in[2],
186 in[2], in[2]);
187 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
188 _mm256_storeu_si256((__m256i*)(out), results);
189 out += 8;
190
191 in += 3;
192
193 return in;
194 }
195
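// For widths that divide 32 evenly (1, 2, 4, 8, 16) no value ever crosses a
// word boundary, so no stitching is needed: the same shift pattern simply
// repeats, moving to a new source word every 32/N values.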
196 inline static const uint32_t* unpack4_32_avx2(const uint32_t* in, uint32_t* out) {
197 uint32_t mask = 0xf;
198 __m256i reg_shifts, reg_inls, reg_masks;
199 __m256i results;
200
201 reg_masks = _mm256_set1_epi32(mask);
202
203 // shift the first 8 outs
204 reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
205 12, 8, 4, 0);
206 reg_inls = _mm256_set_epi32(in[0], in[0],
207 in[0], in[0],
208 in[0], in[0],
209 in[0], in[0]);
210 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
211 _mm256_storeu_si256((__m256i*)(out), results);
212 out += 8;
213
214 // shift the second 8 outs
215 reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
216 12, 8, 4, 0);
217 reg_inls = _mm256_set_epi32(in[1], in[1],
218 in[1], in[1],
219 in[1], in[1],
220 in[1], in[1]);
221 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
222 _mm256_storeu_si256((__m256i*)(out), results);
223 out += 8;
224
225 // shift the third 8 outs
226 reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
227 12, 8, 4, 0);
228 reg_inls = _mm256_set_epi32(in[2], in[2],
229 in[2], in[2],
230 in[2], in[2],
231 in[2], in[2]);
232 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
233 _mm256_storeu_si256((__m256i*)(out), results);
234 out += 8;
235
236 // shift the last 8 outs
237 reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
238 12, 8, 4, 0);
239 reg_inls = _mm256_set_epi32(in[3], in[3],
240 in[3], in[3],
241 in[3], in[3],
242 in[3], in[3]);
243 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
244 _mm256_storeu_si256((__m256i*)(out), results);
245 out += 8;
246
247 in += 4;
248
249 return in;
250 }
251
252 inline static const uint32_t* unpack5_32_avx2(const uint32_t* in, uint32_t* out) {
253 uint32_t mask = 0x1f;
254 __m256i reg_shifts, reg_inls, reg_masks;
255 __m256i results;
256
257 reg_masks = _mm256_set1_epi32(mask);
258
259 // shift the first 8 outs
260 reg_shifts = _mm256_set_epi32(3, 0, 25, 20,
261 15, 10, 5, 0);
262 reg_inls = _mm256_set_epi32(in[1], in[0] >> 30 | in[1] << 2,
263 in[0], in[0],
264 in[0], in[0],
265 in[0], in[0]);
266 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
267 _mm256_storeu_si256((__m256i*)(out), results);
268 out += 8;
269
270 // shift the second 8 outs
271 reg_shifts = _mm256_set_epi32(11, 6, 1, 0,
272 23, 18, 13, 8);
273 reg_inls = _mm256_set_epi32(in[2], in[2],
274 in[2], in[1] >> 28 | in[2] << 4,
275 in[1], in[1],
276 in[1], in[1]);
277 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
278 _mm256_storeu_si256((__m256i*)(out), results);
279 out += 8;
280
281 // shift the third 8 outs
282 reg_shifts = _mm256_set_epi32(19, 14, 9, 4,
283 0, 26, 21, 16);
284 reg_inls = _mm256_set_epi32(in[3], in[3],
285 in[3], in[3],
286 in[2] >> 31 | in[3] << 1, in[2],
287 in[2], in[2]);
288 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
289 _mm256_storeu_si256((__m256i*)(out), results);
290 out += 8;
291
292 // shift the last 8 outs
293 reg_shifts = _mm256_set_epi32(27, 22, 17, 12,
294 7, 2, 0, 24);
295 reg_inls = _mm256_set_epi32(in[4], in[4],
296 in[4], in[4],
297 in[4], in[4],
298 in[3] >> 29 | in[4] << 3, in[3]);
299 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
300 _mm256_storeu_si256((__m256i*)(out), results);
301 out += 8;
302
303 in += 5;
304
305 return in;
306 }
307
308 inline static const uint32_t* unpack6_32_avx2(const uint32_t* in, uint32_t* out) {
309 uint32_t mask = 0x3f;
310 __m256i reg_shifts, reg_inls, reg_masks;
311 __m256i results;
312
313 reg_masks = _mm256_set1_epi32(mask);
314
315 // shift the first 8 outs
316 reg_shifts = _mm256_set_epi32(10, 4, 0, 24,
317 18, 12, 6, 0);
318 reg_inls = _mm256_set_epi32(in[1], in[1],
319 in[0] >> 30 | in[1] << 2, in[0],
320 in[0], in[0],
321 in[0], in[0]);
322 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
323 _mm256_storeu_si256((__m256i*)(out), results);
324 out += 8;
325
326 // shift the second 8 outs
327 reg_shifts = _mm256_set_epi32(26, 20, 14, 8,
328 2, 0, 22, 16);
329 reg_inls = _mm256_set_epi32(in[2], in[2],
330 in[2], in[2],
331 in[2], in[1] >> 28 | in[2] << 4,
332 in[1], in[1]);
333 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
334 _mm256_storeu_si256((__m256i*)(out), results);
335 out += 8;
336
337 // shift the third 8 outs
338 reg_shifts = _mm256_set_epi32(10, 4, 0, 24,
339 18, 12, 6, 0);
340 reg_inls = _mm256_set_epi32(in[4], in[4],
341 in[3] >> 30 | in[4] << 2, in[3],
342 in[3], in[3],
343 in[3], in[3]);
344 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
345 _mm256_storeu_si256((__m256i*)(out), results);
346 out += 8;
347
348 // shift the last 8 outs
349 reg_shifts = _mm256_set_epi32(26, 20, 14, 8,
350 2, 0, 22, 16);
351 reg_inls = _mm256_set_epi32(in[5], in[5],
352 in[5], in[5],
353 in[5], in[4] >> 28 | in[5] << 4,
354 in[4], in[4]);
355 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
356 _mm256_storeu_si256((__m256i*)(out), results);
357 out += 8;
358
359 in += 6;
360
361 return in;
362 }
363
364 inline static const uint32_t* unpack7_32_avx2(const uint32_t* in, uint32_t* out) {
365 uint32_t mask = 0x7f;
366 __m256i reg_shifts, reg_inls, reg_masks;
367 __m256i results;
368
369 reg_masks = _mm256_set1_epi32(mask);
370
371 // shift the first 8 outs
372 reg_shifts = _mm256_set_epi32(17, 10, 3, 0,
373 21, 14, 7, 0);
374 reg_inls = _mm256_set_epi32(in[1], in[1],
375 in[1], in[0] >> 28 | in[1] << 4,
376 in[0], in[0],
377 in[0], in[0]);
378 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
379 _mm256_storeu_si256((__m256i*)(out), results);
380 out += 8;
381
382 // shift the second 8 outs
383 reg_shifts = _mm256_set_epi32(9, 2, 0, 20,
384 13, 6, 0, 24);
385 reg_inls = _mm256_set_epi32(in[3], in[3],
386 in[2] >> 27 | in[3] << 5, in[2],
387 in[2], in[2],
388 in[1] >> 31 | in[2] << 1, in[1]);
389 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
390 _mm256_storeu_si256((__m256i*)(out), results);
391 out += 8;
392
393 // shift the third 8 outs
394 reg_shifts = _mm256_set_epi32(1, 0, 19, 12,
395 5, 0, 23, 16);
396 reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
397 in[4], in[4],
398 in[4], in[3] >> 30 | in[4] << 2,
399 in[3], in[3]);
400 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
401 _mm256_storeu_si256((__m256i*)(out), results);
402 out += 8;
403
404 // shift the last 8 outs
405 reg_shifts = _mm256_set_epi32(25, 18, 11, 4,
406 0, 22, 15, 8);
407 reg_inls = _mm256_set_epi32(in[6], in[6],
408 in[6], in[6],
409 in[5] >> 29 | in[6] << 3, in[5],
410 in[5], in[5]);
411 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
412 _mm256_storeu_si256((__m256i*)(out), results);
413 out += 8;
414
415 in += 7;
416
417 return in;
418 }
419
420 inline static const uint32_t* unpack8_32_avx2(const uint32_t* in, uint32_t* out) {
421 uint32_t mask = 0xff;
422 __m256i reg_shifts, reg_inls, reg_masks;
423 __m256i results;
424
425 reg_masks = _mm256_set1_epi32(mask);
426
427 // shift the first 8 outs
428 reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
429 24, 16, 8, 0);
430 reg_inls = _mm256_set_epi32(in[1], in[1],
431 in[1], in[1],
432 in[0], in[0],
433 in[0], in[0]);
434 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
435 _mm256_storeu_si256((__m256i*)(out), results);
436 out += 8;
437
438 // shift the second 8 outs
439 reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
440 24, 16, 8, 0);
441 reg_inls = _mm256_set_epi32(in[3], in[3],
442 in[3], in[3],
443 in[2], in[2],
444 in[2], in[2]);
445 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
446 _mm256_storeu_si256((__m256i*)(out), results);
447 out += 8;
448
449 // shift the third 8 outs
450 reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
451 24, 16, 8, 0);
452 reg_inls = _mm256_set_epi32(in[5], in[5],
453 in[5], in[5],
454 in[4], in[4],
455 in[4], in[4]);
456 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
457 _mm256_storeu_si256((__m256i*)(out), results);
458 out += 8;
459
460 // shift the last 8 outs
461 reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
462 24, 16, 8, 0);
463 reg_inls = _mm256_set_epi32(in[7], in[7],
464 in[7], in[7],
465 in[6], in[6],
466 in[6], in[6]);
467 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
468 _mm256_storeu_si256((__m256i*)(out), results);
469 out += 8;
470
471 in += 8;
472
473 return in;
474 }
475
476 inline static const uint32_t* unpack9_32_avx2(const uint32_t* in, uint32_t* out) {
477 uint32_t mask = 0x1ff;
478 __m256i reg_shifts, reg_inls, reg_masks;
479 __m256i results;
480
481 reg_masks = _mm256_set1_epi32(mask);
482
483 // shift the first 8 outs
484 reg_shifts = _mm256_set_epi32(0, 22, 13, 4,
485 0, 18, 9, 0);
486 reg_inls = _mm256_set_epi32(in[1] >> 31 | in[2] << 1, in[1],
487 in[1], in[1],
488 in[0] >> 27 | in[1] << 5, in[0],
489 in[0], in[0]);
490 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
491 _mm256_storeu_si256((__m256i*)(out), results);
492 out += 8;
493
494 // shift the second 8 outs
495 reg_shifts = _mm256_set_epi32(7, 0, 21, 12,
496 3, 0, 17, 8);
497 reg_inls = _mm256_set_epi32(in[4], in[3] >> 30 | in[4] << 2,
498 in[3], in[3],
499 in[3], in[2] >> 26 | in[3] << 6,
500 in[2], in[2]);
501 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
502 _mm256_storeu_si256((__m256i*)(out), results);
503 out += 8;
504
505 // shift the third 8 outs
506 reg_shifts = _mm256_set_epi32(15, 6, 0, 20,
507 11, 2, 0, 16);
508 reg_inls = _mm256_set_epi32(in[6], in[6],
509 in[5] >> 29 | in[6] << 3, in[5],
510 in[5], in[5],
511 in[4] >> 25 | in[5] << 7, in[4]);
512 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
513 _mm256_storeu_si256((__m256i*)(out), results);
514 out += 8;
515
516 // shift the last 8 outs
517 reg_shifts = _mm256_set_epi32(23, 14, 5, 0,
518 19, 10, 1, 0);
519 reg_inls = _mm256_set_epi32(in[8], in[8],
520 in[8], in[7] >> 28 | in[8] << 4,
521 in[7], in[7],
522 in[7], in[6] >> 24 | in[7] << 8);
523 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
524 _mm256_storeu_si256((__m256i*)(out), results);
525 out += 8;
526
527 in += 9;
528
529 return in;
530 }
531
532 inline static const uint32_t* unpack10_32_avx2(const uint32_t* in, uint32_t* out) {
533 uint32_t mask = 0x3ff;
534 __m256i reg_shifts, reg_inls, reg_masks;
535 __m256i results;
536
537 reg_masks = _mm256_set1_epi32(mask);
538
539 // shift the first 8 outs
540 reg_shifts = _mm256_set_epi32(6, 0, 18, 8,
541 0, 20, 10, 0);
542 reg_inls = _mm256_set_epi32(in[2], in[1] >> 28 | in[2] << 4,
543 in[1], in[1],
544 in[0] >> 30 | in[1] << 2, in[0],
545 in[0], in[0]);
546 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
547 _mm256_storeu_si256((__m256i*)(out), results);
548 out += 8;
549
550 // shift the second 8 outs
551 reg_shifts = _mm256_set_epi32(22, 12, 2, 0,
552 14, 4, 0, 16);
553 reg_inls = _mm256_set_epi32(in[4], in[4],
554 in[4], in[3] >> 24 | in[4] << 8,
555 in[3], in[3],
556 in[2] >> 26 | in[3] << 6, in[2]);
557 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
558 _mm256_storeu_si256((__m256i*)(out), results);
559 out += 8;
560
561 // shift the third 8 outs
562 reg_shifts = _mm256_set_epi32(6, 0, 18, 8,
563 0, 20, 10, 0);
564 reg_inls = _mm256_set_epi32(in[7], in[6] >> 28 | in[7] << 4,
565 in[6], in[6],
566 in[5] >> 30 | in[6] << 2, in[5],
567 in[5], in[5]);
568 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
569 _mm256_storeu_si256((__m256i*)(out), results);
570 out += 8;
571
572 // shift the last 8 outs
573 reg_shifts = _mm256_set_epi32(22, 12, 2, 0,
574 14, 4, 0, 16);
575 reg_inls = _mm256_set_epi32(in[9], in[9],
576 in[9], in[8] >> 24 | in[9] << 8,
577 in[8], in[8],
578 in[7] >> 26 | in[8] << 6, in[7]);
579 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
580 _mm256_storeu_si256((__m256i*)(out), results);
581 out += 8;
582
583 in += 10;
584
585 return in;
586 }
587
588 inline static const uint32_t* unpack11_32_avx2(const uint32_t* in, uint32_t* out) {
589 uint32_t mask = 0x7ff;
590 __m256i reg_shifts, reg_inls, reg_masks;
591 __m256i results;
592
593 reg_masks = _mm256_set1_epi32(mask);
594
595 // shift the first 8 outs
596 reg_shifts = _mm256_set_epi32(13, 2, 0, 12,
597 1, 0, 11, 0);
598 reg_inls = _mm256_set_epi32(in[2], in[2],
599 in[1] >> 23 | in[2] << 9, in[1],
600 in[1], in[0] >> 22 | in[1] << 10,
601 in[0], in[0]);
602 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
603 _mm256_storeu_si256((__m256i*)(out), results);
604 out += 8;
605
606 // shift the second 8 outs
607 reg_shifts = _mm256_set_epi32(5, 0, 15, 4,
608 0, 14, 3, 0);
609 reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
610 in[4], in[4],
611 in[3] >> 25 | in[4] << 7, in[3],
612 in[3], in[2] >> 24 | in[3] << 8);
613 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
614 _mm256_storeu_si256((__m256i*)(out), results);
615 out += 8;
616
617 // shift the third 8 outs
618 reg_shifts = _mm256_set_epi32(0, 18, 7, 0,
619 17, 6, 0, 16);
620 reg_inls = _mm256_set_epi32(in[7] >> 29 | in[8] << 3, in[7],
621 in[7], in[6] >> 28 | in[7] << 4,
622 in[6], in[6],
623 in[5] >> 27 | in[6] << 5, in[5]);
624 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
625 _mm256_storeu_si256((__m256i*)(out), results);
626 out += 8;
627
628 // shift the last 8 outs
629 reg_shifts = _mm256_set_epi32(21, 10, 0, 20,
630 9, 0, 19, 8);
631 reg_inls = _mm256_set_epi32(in[10], in[10],
632 in[9] >> 31 | in[10] << 1, in[9],
633 in[9], in[8] >> 30 | in[9] << 2,
634 in[8], in[8]);
635 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
636 _mm256_storeu_si256((__m256i*)(out), results);
637 out += 8;
638
639 in += 11;
640
641 return in;
642 }
643
644 inline static const uint32_t* unpack12_32_avx2(const uint32_t* in, uint32_t* out) {
645 uint32_t mask = 0xfff;
646 __m256i reg_shifts, reg_inls, reg_masks;
647 __m256i results;
648
649 reg_masks = _mm256_set1_epi32(mask);
650
651 // shift the first 8 outs
652 reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
653 4, 0, 12, 0);
654 reg_inls = _mm256_set_epi32(in[2], in[2],
655 in[1] >> 28 | in[2] << 4, in[1],
656 in[1], in[0] >> 24 | in[1] << 8,
657 in[0], in[0]);
658 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
659 _mm256_storeu_si256((__m256i*)(out), results);
660 out += 8;
661
662 // shift the second 8 outs
663 reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
664 4, 0, 12, 0);
665 reg_inls = _mm256_set_epi32(in[5], in[5],
666 in[4] >> 28 | in[5] << 4, in[4],
667 in[4], in[3] >> 24 | in[4] << 8,
668 in[3], in[3]);
669 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
670 _mm256_storeu_si256((__m256i*)(out), results);
671 out += 8;
672
673 // shift the third 8 outs
674 reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
675 4, 0, 12, 0);
676 reg_inls = _mm256_set_epi32(in[8], in[8],
677 in[7] >> 28 | in[8] << 4, in[7],
678 in[7], in[6] >> 24 | in[7] << 8,
679 in[6], in[6]);
680 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
681 _mm256_storeu_si256((__m256i*)(out), results);
682 out += 8;
683
684 // shift the last 8 outs
685 reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
686 4, 0, 12, 0);
687 reg_inls = _mm256_set_epi32(in[11], in[11],
688 in[10] >> 28 | in[11] << 4, in[10],
689 in[10], in[9] >> 24 | in[10] << 8,
690 in[9], in[9]);
691 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
692 _mm256_storeu_si256((__m256i*)(out), results);
693 out += 8;
694
695 in += 12;
696
697 return in;
698 }
699
700 inline static const uint32_t* unpack13_32_avx2(const uint32_t* in, uint32_t* out) {
701 uint32_t mask = 0x1fff;
702 __m256i reg_shifts, reg_inls, reg_masks;
703 __m256i results;
704
705 reg_masks = _mm256_set1_epi32(mask);
706
707 // shift the first 8 outs
708 reg_shifts = _mm256_set_epi32(0, 14, 1, 0,
709 7, 0, 13, 0);
710 reg_inls = _mm256_set_epi32(in[2] >> 27 | in[3] << 5, in[2],
711 in[2], in[1] >> 20 | in[2] << 12,
712 in[1], in[0] >> 26 | in[1] << 6,
713 in[0], in[0]);
714 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
715 _mm256_storeu_si256((__m256i*)(out), results);
716 out += 8;
717
718 // shift the second 8 outs
719 reg_shifts = _mm256_set_epi32(3, 0, 9, 0,
720 15, 2, 0, 8);
721 reg_inls = _mm256_set_epi32(in[6], in[5] >> 22 | in[6] << 10,
722 in[5], in[4] >> 28 | in[5] << 4,
723 in[4], in[4],
724 in[3] >> 21 | in[4] << 11, in[3]);
725 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
726 _mm256_storeu_si256((__m256i*)(out), results);
727 out += 8;
728
729 // shift the third 8 outs
730 reg_shifts = _mm256_set_epi32(11, 0, 17, 4,
731 0, 10, 0, 16);
732 reg_inls = _mm256_set_epi32(in[9], in[8] >> 30 | in[9] << 2,
733 in[8], in[8],
734 in[7] >> 23 | in[8] << 9, in[7],
735 in[6] >> 29 | in[7] << 3, in[6]);
736 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
737 _mm256_storeu_si256((__m256i*)(out), results);
738 out += 8;
739
740 // shift the last 8 outs
741 reg_shifts = _mm256_set_epi32(19, 6, 0, 12,
742 0, 18, 5, 0);
743 reg_inls = _mm256_set_epi32(in[12], in[12],
744 in[11] >> 25 | in[12] << 7, in[11],
745 in[10] >> 31 | in[11] << 1, in[10],
746 in[10], in[9] >> 24 | in[10] << 8);
747 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
748 _mm256_storeu_si256((__m256i*)(out), results);
749 out += 8;
750
751 in += 13;
752
753 return in;
754 }
755
756 inline static const uint32_t* unpack14_32_avx2(const uint32_t* in, uint32_t* out) {
757 uint32_t mask = 0x3fff;
758 __m256i reg_shifts, reg_inls, reg_masks;
759 __m256i results;
760
761 reg_masks = _mm256_set1_epi32(mask);
762
763 // shift the first 8 outs
764 reg_shifts = _mm256_set_epi32(2, 0, 6, 0,
765 10, 0, 14, 0);
766 reg_inls = _mm256_set_epi32(in[3], in[2] >> 20 | in[3] << 12,
767 in[2], in[1] >> 24 | in[2] << 8,
768 in[1], in[0] >> 28 | in[1] << 4,
769 in[0], in[0]);
770 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
771 _mm256_storeu_si256((__m256i*)(out), results);
772 out += 8;
773
774 // shift the second 8 outs
775 reg_shifts = _mm256_set_epi32(18, 4, 0, 8,
776 0, 12, 0, 16);
777 reg_inls = _mm256_set_epi32(in[6], in[6],
778 in[5] >> 22 | in[6] << 10, in[5],
779 in[4] >> 26 | in[5] << 6, in[4],
780 in[3] >> 30 | in[4] << 2, in[3]);
781 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
782 _mm256_storeu_si256((__m256i*)(out), results);
783 out += 8;
784
785 // shift the third 8 outs
786 reg_shifts = _mm256_set_epi32(2, 0, 6, 0,
787 10, 0, 14, 0);
788 reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
789 in[9], in[8] >> 24 | in[9] << 8,
790 in[8], in[7] >> 28 | in[8] << 4,
791 in[7], in[7]);
792 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
793 _mm256_storeu_si256((__m256i*)(out), results);
794 out += 8;
795
796 // shift the last 8 outs
797 reg_shifts = _mm256_set_epi32(18, 4, 0, 8,
798 0, 12, 0, 16);
799 reg_inls = _mm256_set_epi32(in[13], in[13],
800 in[12] >> 22 | in[13] << 10, in[12],
801 in[11] >> 26 | in[12] << 6, in[11],
802 in[10] >> 30 | in[11] << 2, in[10]);
803 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
804 _mm256_storeu_si256((__m256i*)(out), results);
805 out += 8;
806
807 in += 14;
808
809 return in;
810 }
811
812 inline static const uint32_t* unpack15_32_avx2(const uint32_t* in, uint32_t* out) {
813 uint32_t mask = 0x7fff;
814 __m256i reg_shifts, reg_inls, reg_masks;
815 __m256i results;
816
817 reg_masks = _mm256_set1_epi32(mask);
818
819 // shift the first 8 outs
820 reg_shifts = _mm256_set_epi32(9, 0, 11, 0,
821 13, 0, 15, 0);
822 reg_inls = _mm256_set_epi32(in[3], in[2] >> 26 | in[3] << 6,
823 in[2], in[1] >> 28 | in[2] << 4,
824 in[1], in[0] >> 30 | in[1] << 2,
825 in[0], in[0]);
826 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
827 _mm256_storeu_si256((__m256i*)(out), results);
828 out += 8;
829
830 // shift the second 8 outs
831 reg_shifts = _mm256_set_epi32(1, 0, 3, 0,
832 5, 0, 7, 0);
833 reg_inls = _mm256_set_epi32(in[7], in[6] >> 18 | in[7] << 14,
834 in[6], in[5] >> 20 | in[6] << 12,
835 in[5], in[4] >> 22 | in[5] << 10,
836 in[4], in[3] >> 24 | in[4] << 8);
837 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
838 _mm256_storeu_si256((__m256i*)(out), results);
839 out += 8;
840
841 // shift the third 8 outs
842 reg_shifts = _mm256_set_epi32(0, 10, 0, 12,
843 0, 14, 0, 16);
844 reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
845 in[9] >> 27 | in[10] << 5, in[9],
846 in[8] >> 29 | in[9] << 3, in[8],
847 in[7] >> 31 | in[8] << 1, in[7]);
848 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
849 _mm256_storeu_si256((__m256i*)(out), results);
850 out += 8;
851
852 // shift the last 8 outs
853 reg_shifts = _mm256_set_epi32(17, 2, 0, 4,
854 0, 6, 0, 8);
855 reg_inls = _mm256_set_epi32(in[14], in[14],
856 in[13] >> 19 | in[14] << 13, in[13],
857 in[12] >> 21 | in[13] << 11, in[12],
858 in[11] >> 23 | in[12] << 9, in[11]);
859 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
860 _mm256_storeu_si256((__m256i*)(out), results);
861 out += 8;
862
863 in += 15;
864
865 return in;
866 }
867
868 inline static const uint32_t* unpack16_32_avx2(const uint32_t* in, uint32_t* out) {
869 uint32_t mask = 0xffff;
870 __m256i reg_shifts, reg_inls, reg_masks;
871 __m256i results;
872
873 reg_masks = _mm256_set1_epi32(mask);
874
875 // shift the first 8 outs
876 reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
877 16, 0, 16, 0);
878 reg_inls = _mm256_set_epi32(in[3], in[3],
879 in[2], in[2],
880 in[1], in[1],
881 in[0], in[0]);
882 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
883 _mm256_storeu_si256((__m256i*)(out), results);
884 out += 8;
885
886 // shift the second 8 outs
887 reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
888 16, 0, 16, 0);
889 reg_inls = _mm256_set_epi32(in[7], in[7],
890 in[6], in[6],
891 in[5], in[5],
892 in[4], in[4]);
893 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
894 _mm256_storeu_si256((__m256i*)(out), results);
895 out += 8;
896
897 // shift the third 8 outs
898 reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
899 16, 0, 16, 0);
900 reg_inls = _mm256_set_epi32(in[11], in[11],
901 in[10], in[10],
902 in[9], in[9],
903 in[8], in[8]);
904 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
905 _mm256_storeu_si256((__m256i*)(out), results);
906 out += 8;
907
908 // shift the last 8 outs
909 reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
910 16, 0, 16, 0);
911 reg_inls = _mm256_set_epi32(in[15], in[15],
912 in[14], in[14],
913 in[13], in[13],
914 in[12], in[12]);
915 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
916 _mm256_storeu_si256((__m256i*)(out), results);
917 out += 8;
918
919 in += 16;
920
921 return in;
922 }
923
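// For widths greater than 16, at least half of the 32 values straddle a word
// boundary, so the stitched in[k] >> r | in[k+1] << (32 - r) form with a
// shift of 0 appears in most lane groups below.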
924 inline static const uint32_t* unpack17_32_avx2(const uint32_t* in, uint32_t* out) {
925 uint32_t mask = 0x1ffff;
926 __m256i reg_shifts, reg_inls, reg_masks;
927 __m256i results;
928
929 reg_masks = _mm256_set1_epi32(mask);
930
931 // shift the first 8 outs
932 reg_shifts = _mm256_set_epi32(0, 6, 0, 4,
933 0, 2, 0, 0);
934 reg_inls = _mm256_set_epi32(in[3] >> 23 | in[4] << 9, in[3],
935 in[2] >> 21 | in[3] << 11, in[2],
936 in[1] >> 19 | in[2] << 13, in[1],
937 in[0] >> 17 | in[1] << 15, in[0]);
938 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
939 _mm256_storeu_si256((__m256i*)(out), results);
940 out += 8;
941
942 // shift the second 8 outs
943 reg_shifts = _mm256_set_epi32(0, 14, 0, 12,
944 0, 10, 0, 8);
945 reg_inls = _mm256_set_epi32(in[7] >> 31 | in[8] << 1, in[7],
946 in[6] >> 29 | in[7] << 3, in[6],
947 in[5] >> 27 | in[6] << 5, in[5],
948 in[4] >> 25 | in[5] << 7, in[4]);
949 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
950 _mm256_storeu_si256((__m256i*)(out), results);
951 out += 8;
952
953 // shift the third 8 outs
954 reg_shifts = _mm256_set_epi32(7, 0, 5, 0,
955 3, 0, 1, 0);
956 reg_inls = _mm256_set_epi32(in[12], in[11] >> 22 | in[12] << 10,
957 in[11], in[10] >> 20 | in[11] << 12,
958 in[10], in[9] >> 18 | in[10] << 14,
959 in[9], in[8] >> 16 | in[9] << 16);
960 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
961 _mm256_storeu_si256((__m256i*)(out), results);
962 out += 8;
963
964 // shift the last 8 outs
965 reg_shifts = _mm256_set_epi32(15, 0, 13, 0,
966 11, 0, 9, 0);
967 reg_inls = _mm256_set_epi32(in[16], in[15] >> 30 | in[16] << 2,
968 in[15], in[14] >> 28 | in[15] << 4,
969 in[14], in[13] >> 26 | in[14] << 6,
970 in[13], in[12] >> 24 | in[13] << 8);
971 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
972 _mm256_storeu_si256((__m256i*)(out), results);
973 out += 8;
974
975 in += 17;
976
977 return in;
978 }
979
980 inline static const uint32_t* unpack18_32_avx2(const uint32_t* in, uint32_t* out) {
981 uint32_t mask = 0x3ffff;
982 __m256i reg_shifts, reg_inls, reg_masks;
983 __m256i results;
984
985 reg_masks = _mm256_set1_epi32(mask);
986
987 // shift the first 8 outs
988 reg_shifts = _mm256_set_epi32(0, 12, 0, 8,
989 0, 4, 0, 0);
990 reg_inls = _mm256_set_epi32(in[3] >> 30 | in[4] << 2, in[3],
991 in[2] >> 26 | in[3] << 6, in[2],
992 in[1] >> 22 | in[2] << 10, in[1],
993 in[0] >> 18 | in[1] << 14, in[0]);
994 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
995 _mm256_storeu_si256((__m256i*)(out), results);
996 out += 8;
997
998 // shift the second 8 outs
999 reg_shifts = _mm256_set_epi32(14, 0, 10, 0,
1000 6, 0, 2, 0);
1001 reg_inls = _mm256_set_epi32(in[8], in[7] >> 28 | in[8] << 4,
1002 in[7], in[6] >> 24 | in[7] << 8,
1003 in[6], in[5] >> 20 | in[6] << 12,
1004 in[5], in[4] >> 16 | in[5] << 16);
1005 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1006 _mm256_storeu_si256((__m256i*)(out), results);
1007 out += 8;
1008
1009 // shift the third 8 outs
1010 reg_shifts = _mm256_set_epi32(0, 12, 0, 8,
1011 0, 4, 0, 0);
1012 reg_inls = _mm256_set_epi32(in[12] >> 30 | in[13] << 2, in[12],
1013 in[11] >> 26 | in[12] << 6, in[11],
1014 in[10] >> 22 | in[11] << 10, in[10],
1015 in[9] >> 18 | in[10] << 14, in[9]);
1016 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1017 _mm256_storeu_si256((__m256i*)(out), results);
1018 out += 8;
1019
1020 // shift the last 8 outs
1021 reg_shifts = _mm256_set_epi32(14, 0, 10, 0,
1022 6, 0, 2, 0);
1023 reg_inls = _mm256_set_epi32(in[17], in[16] >> 28 | in[17] << 4,
1024 in[16], in[15] >> 24 | in[16] << 8,
1025 in[15], in[14] >> 20 | in[15] << 12,
1026 in[14], in[13] >> 16 | in[14] << 16);
1027 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1028 _mm256_storeu_si256((__m256i*)(out), results);
1029 out += 8;
1030
1031 in += 18;
1032
1033 return in;
1034 }
1035
1036 inline static const uint32_t* unpack19_32_avx2(const uint32_t* in, uint32_t* out) {
1037 uint32_t mask = 0x7ffff;
1038 __m256i reg_shifts, reg_inls, reg_masks;
1039 __m256i results;
1040
1041 reg_masks = _mm256_set1_epi32(mask);
1042
1043 // shift the first 8 outs
1044 reg_shifts = _mm256_set_epi32(5, 0, 0, 12,
1045 0, 6, 0, 0);
1046 reg_inls = _mm256_set_epi32(in[4], in[3] >> 18 | in[4] << 14,
1047 in[2] >> 31 | in[3] << 1, in[2],
1048 in[1] >> 25 | in[2] << 7, in[1],
1049 in[0] >> 19 | in[1] << 13, in[0]);
1050 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1051 _mm256_storeu_si256((__m256i*)(out), results);
1052 out += 8;
1053
1054 // shift the second 8 outs
1055 reg_shifts = _mm256_set_epi32(0, 10, 0, 4,
1056 0, 0, 11, 0);
1057 reg_inls = _mm256_set_epi32(in[8] >> 29 | in[9] << 3, in[8],
1058 in[7] >> 23 | in[8] << 9, in[7],
1059 in[6] >> 17 | in[7] << 15, in[5] >> 30 | in[6] << 2,
1060 in[5], in[4] >> 24 | in[5] << 8);
1061 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1062 _mm256_storeu_si256((__m256i*)(out), results);
1063 out += 8;
1064
1065 // shift the third 8 outs
1066 reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
1067 9, 0, 3, 0);
1068 reg_inls = _mm256_set_epi32(in[13] >> 21 | in[14] << 11, in[13],
1069 in[12] >> 15 | in[13] << 17, in[11] >> 28 | in[12] << 4,
1070 in[11], in[10] >> 22 | in[11] << 10,
1071 in[10], in[9] >> 16 | in[10] << 16);
1072 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1073 _mm256_storeu_si256((__m256i*)(out), results);
1074 out += 8;
1075
1076 // shift the last 8 outs
1077 reg_shifts = _mm256_set_epi32(13, 0, 7, 0,
1078 1, 0, 0, 8);
1079 reg_inls = _mm256_set_epi32(in[18], in[17] >> 26 | in[18] << 6,
1080 in[17], in[16] >> 20 | in[17] << 12,
1081 in[16], in[15] >> 14 | in[16] << 18,
1082 in[14] >> 27 | in[15] << 5, in[14]);
1083 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1084 _mm256_storeu_si256((__m256i*)(out), results);
1085 out += 8;
1086
1087 in += 19;
1088
1089 return in;
1090 }
1091
1092 inline static const uint32_t* unpack20_32_avx2(const uint32_t* in, uint32_t* out) {
1093 uint32_t mask = 0xfffff;
1094 __m256i reg_shifts, reg_inls, reg_masks;
1095 __m256i results;
1096
1097 reg_masks = _mm256_set1_epi32(mask);
1098
1099 // shift the first 8 outs
1100 reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
1101 0, 8, 0, 0);
1102 reg_inls = _mm256_set_epi32(in[4], in[3] >> 24 | in[4] << 8,
1103 in[3], in[2] >> 16 | in[3] << 16,
1104 in[1] >> 28 | in[2] << 4, in[1],
1105 in[0] >> 20 | in[1] << 12, in[0]);
1106 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1107 _mm256_storeu_si256((__m256i*)(out), results);
1108 out += 8;
1109
1110 // shift the second 8 outs
1111 reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
1112 0, 8, 0, 0);
1113 reg_inls = _mm256_set_epi32(in[9], in[8] >> 24 | in[9] << 8,
1114 in[8], in[7] >> 16 | in[8] << 16,
1115 in[6] >> 28 | in[7] << 4, in[6],
1116 in[5] >> 20 | in[6] << 12, in[5]);
1117 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1118 _mm256_storeu_si256((__m256i*)(out), results);
1119 out += 8;
1120
1121 // shift the third 8 outs
1122 reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
1123 0, 8, 0, 0);
1124 reg_inls = _mm256_set_epi32(in[14], in[13] >> 24 | in[14] << 8,
1125 in[13], in[12] >> 16 | in[13] << 16,
1126 in[11] >> 28 | in[12] << 4, in[11],
1127 in[10] >> 20 | in[11] << 12, in[10]);
1128 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1129 _mm256_storeu_si256((__m256i*)(out), results);
1130 out += 8;
1131
1132 // shift the last 8 outs
1133 reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
1134 0, 8, 0, 0);
1135 reg_inls = _mm256_set_epi32(in[19], in[18] >> 24 | in[19] << 8,
1136 in[18], in[17] >> 16 | in[18] << 16,
1137 in[16] >> 28 | in[17] << 4, in[16],
1138 in[15] >> 20 | in[16] << 12, in[15]);
1139 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1140 _mm256_storeu_si256((__m256i*)(out), results);
1141 out += 8;
1142
1143 in += 20;
1144
1145 return in;
1146 }
1147
1148 inline static const uint32_t* unpack21_32_avx2(const uint32_t* in, uint32_t* out) {
1149 uint32_t mask = 0x1fffff;
1150 __m256i reg_shifts, reg_inls, reg_masks;
1151 __m256i results;
1152
1153 reg_masks = _mm256_set1_epi32(mask);
1154
1155 // shift the first 8 outs
1156 reg_shifts = _mm256_set_epi32(0, 0, 9, 0,
1157 0, 10, 0, 0);
1158 reg_inls = _mm256_set_epi32(in[4] >> 19 | in[5] << 13, in[3] >> 30 | in[4] << 2,
1159 in[3], in[2] >> 20 | in[3] << 12,
1160 in[1] >> 31 | in[2] << 1, in[1],
1161 in[0] >> 21 | in[1] << 11, in[0]);
1162 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1163 _mm256_storeu_si256((__m256i*)(out), results);
1164 out += 8;
1165
1166 // shift the second 8 outs
1167 reg_shifts = _mm256_set_epi32(0, 6, 0, 0,
1168 7, 0, 0, 8);
1169 reg_inls = _mm256_set_epi32(in[9] >> 27 | in[10] << 5, in[9],
1170 in[8] >> 17 | in[9] << 15, in[7] >> 28 | in[8] << 4,
1171 in[7], in[6] >> 18 | in[7] << 14,
1172 in[5] >> 29 | in[6] << 3, in[5]);
1173 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1174 _mm256_storeu_si256((__m256i*)(out), results);
1175 out += 8;
1176
1177 // shift the third 8 outs
1178 reg_shifts = _mm256_set_epi32(3, 0, 0, 4,
1179 0, 0, 5, 0);
1180 reg_inls = _mm256_set_epi32(in[15], in[14] >> 14 | in[15] << 18,
1181 in[13] >> 25 | in[14] << 7, in[13],
1182 in[12] >> 15 | in[13] << 17, in[11] >> 26 | in[12] << 6,
1183 in[11], in[10] >> 16 | in[11] << 16);
1184 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1185 _mm256_storeu_si256((__m256i*)(out), results);
1186 out += 8;
1187
1188 // shift the last 8 outs
1189 reg_shifts = _mm256_set_epi32(11, 0, 1, 0,
1190 0, 2, 0, 0);
1191 reg_inls = _mm256_set_epi32(in[20], in[19] >> 22 | in[20] << 10,
1192 in[19], in[18] >> 12 | in[19] << 20,
1193 in[17] >> 23 | in[18] << 9, in[17],
1194 in[16] >> 13 | in[17] << 19, in[15] >> 24 | in[16] << 8);
1195 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1196 _mm256_storeu_si256((__m256i*)(out), results);
1197 out += 8;
1198
1199 in += 21;
1200
1201 return in;
1202 }
1203
1204 inline static const uint32_t* unpack22_32_avx2(const uint32_t* in, uint32_t* out) {
1205 uint32_t mask = 0x3fffff;
1206 __m256i reg_shifts, reg_inls, reg_masks;
1207 __m256i results;
1208
1209 reg_masks = _mm256_set1_epi32(mask);
1210
1211 // shift the first 8 outs
1212 reg_shifts = _mm256_set_epi32(0, 4, 0, 0,
1213 2, 0, 0, 0);
1214 reg_inls = _mm256_set_epi32(in[4] >> 26 | in[5] << 6, in[4],
1215 in[3] >> 14 | in[4] << 18, in[2] >> 24 | in[3] << 8,
1216 in[2], in[1] >> 12 | in[2] << 20,
1217 in[0] >> 22 | in[1] << 10, in[0]);
1218 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1219 _mm256_storeu_si256((__m256i*)(out), results);
1220 out += 8;
1221
1222 // shift the second 8 outs
1223 reg_shifts = _mm256_set_epi32(10, 0, 0, 8,
1224 0, 0, 6, 0);
1225 reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
1226 in[8] >> 30 | in[9] << 2, in[8],
1227 in[7] >> 18 | in[8] << 14, in[6] >> 28 | in[7] << 4,
1228 in[6], in[5] >> 16 | in[6] << 16);
1229 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1230 _mm256_storeu_si256((__m256i*)(out), results);
1231 out += 8;
1232
1233 // shift the third 8 outs
1234 reg_shifts = _mm256_set_epi32(0, 4, 0, 0,
1235 2, 0, 0, 0);
1236 reg_inls = _mm256_set_epi32(in[15] >> 26 | in[16] << 6, in[15],
1237 in[14] >> 14 | in[15] << 18, in[13] >> 24 | in[14] << 8,
1238 in[13], in[12] >> 12 | in[13] << 20,
1239 in[11] >> 22 | in[12] << 10, in[11]);
1240 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1241 _mm256_storeu_si256((__m256i*)(out), results);
1242 out += 8;
1243
1244 // shift the last 8 outs
1245 reg_shifts = _mm256_set_epi32(10, 0, 0, 8,
1246 0, 0, 6, 0);
1247 reg_inls = _mm256_set_epi32(in[21], in[20] >> 20 | in[21] << 12,
1248 in[19] >> 30 | in[20] << 2, in[19],
1249 in[18] >> 18 | in[19] << 14, in[17] >> 28 | in[18] << 4,
1250 in[17], in[16] >> 16 | in[17] << 16);
1251 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1252 _mm256_storeu_si256((__m256i*)(out), results);
1253 out += 8;
1254
1255 in += 22;
1256
1257 return in;
1258 }
1259
1260 inline static const uint32_t* unpack23_32_avx2(const uint32_t* in, uint32_t* out) {
1261 uint32_t mask = 0x7fffff;
1262 __m256i reg_shifts, reg_inls, reg_masks;
1263 __m256i results;
1264
1265 reg_masks = _mm256_set1_epi32(mask);
1266
1267 // shift the first 8 outs
1268 reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
1269 5, 0, 0, 0);
1270 reg_inls = _mm256_set_epi32(in[5], in[4] >> 10 | in[5] << 22,
1271 in[3] >> 19 | in[4] << 13, in[2] >> 28 | in[3] << 4,
1272 in[2], in[1] >> 14 | in[2] << 18,
1273 in[0] >> 23 | in[1] << 9, in[0]);
1274 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1275 _mm256_storeu_si256((__m256i*)(out), results);
1276 out += 8;
1277
1278 // shift the second 8 outs
1279 reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
1280 0, 6, 0, 0);
1281 reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
1282 in[9] >> 11 | in[10] << 21, in[8] >> 20 | in[9] << 12,
1283 in[7] >> 29 | in[8] << 3, in[7],
1284 in[6] >> 15 | in[7] << 17, in[5] >> 24 | in[6] << 8);
1285 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1286 _mm256_storeu_si256((__m256i*)(out), results);
1287 out += 8;
1288
1289 // shift the third 8 outs
1290 reg_shifts = _mm256_set_epi32(0, 0, 3, 0,
1291 0, 0, 7, 0);
1292 reg_inls = _mm256_set_epi32(in[16] >> 17 | in[17] << 15, in[15] >> 26 | in[16] << 6,
1293 in[15], in[14] >> 12 | in[15] << 20,
1294 in[13] >> 21 | in[14] << 11, in[12] >> 30 | in[13] << 2,
1295 in[12], in[11] >> 16 | in[12] << 16);
1296 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1297 _mm256_storeu_si256((__m256i*)(out), results);
1298 out += 8;
1299
1300 // shift the last 8 outs
1301 reg_shifts = _mm256_set_epi32(9, 0, 0, 4,
1302 0, 0, 0, 8);
1303 reg_inls = _mm256_set_epi32(in[22], in[21] >> 18 | in[22] << 14,
1304 in[20] >> 27 | in[21] << 5, in[20],
1305 in[19] >> 13 | in[20] << 19, in[18] >> 22 | in[19] << 10,
1306 in[17] >> 31 | in[18] << 1, in[17]);
1307 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1308 _mm256_storeu_si256((__m256i*)(out), results);
1309 out += 8;
1310
1311 in += 23;
1312
1313 return in;
1314 }
1315
1316 inline static const uint32_t* unpack24_32_avx2(const uint32_t* in, uint32_t* out) {
1317 uint32_t mask = 0xffffff;
1318 __m256i reg_shifts, reg_inls, reg_masks;
1319 __m256i results;
1320
1321 reg_masks = _mm256_set1_epi32(mask);
1322
1323 // shift the first 8 outs
1324 reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
1325 8, 0, 0, 0);
1326 reg_inls = _mm256_set_epi32(in[5], in[4] >> 16 | in[5] << 16,
1327 in[3] >> 24 | in[4] << 8, in[3],
1328 in[2], in[1] >> 16 | in[2] << 16,
1329 in[0] >> 24 | in[1] << 8, in[0]);
1330 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1331 _mm256_storeu_si256((__m256i*)(out), results);
1332 out += 8;
1333
1334 // shift the second 8 outs
1335 reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
1336 8, 0, 0, 0);
1337 reg_inls = _mm256_set_epi32(in[11], in[10] >> 16 | in[11] << 16,
1338 in[9] >> 24 | in[10] << 8, in[9],
1339 in[8], in[7] >> 16 | in[8] << 16,
1340 in[6] >> 24 | in[7] << 8, in[6]);
1341 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1342 _mm256_storeu_si256((__m256i*)(out), results);
1343 out += 8;
1344
1345 // shift the third 8 outs
1346 reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
1347 8, 0, 0, 0);
1348 reg_inls = _mm256_set_epi32(in[17], in[16] >> 16 | in[17] << 16,
1349 in[15] >> 24 | in[16] << 8, in[15],
1350 in[14], in[13] >> 16 | in[14] << 16,
1351 in[12] >> 24 | in[13] << 8, in[12]);
1352 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1353 _mm256_storeu_si256((__m256i*)(out), results);
1354 out += 8;
1355
1356 // shift the last 8 outs
1357 reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
1358 8, 0, 0, 0);
1359 reg_inls = _mm256_set_epi32(in[23], in[22] >> 16 | in[23] << 16,
1360 in[21] >> 24 | in[22] << 8, in[21],
1361 in[20], in[19] >> 16 | in[20] << 16,
1362 in[18] >> 24 | in[19] << 8, in[18]);
1363 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1364 _mm256_storeu_si256((__m256i*)(out), results);
1365 out += 8;
1366
1367 in += 24;
1368
1369 return in;
1370 }
1371
1372 inline static const uint32_t* unpack25_32_avx2(const uint32_t* in, uint32_t* out) {
1373 uint32_t mask = 0x1ffffff;
1374 __m256i reg_shifts, reg_inls, reg_masks;
1375 __m256i results;
1376
1377 reg_masks = _mm256_set1_epi32(mask);
1378
1379 // shift the first 8 outs
1380 reg_shifts = _mm256_set_epi32(0, 0, 0, 4,
1381 0, 0, 0, 0);
1382 reg_inls = _mm256_set_epi32(in[5] >> 15 | in[6] << 17, in[4] >> 22 | in[5] << 10,
1383 in[3] >> 29 | in[4] << 3, in[3],
1384 in[2] >> 11 | in[3] << 21, in[1] >> 18 | in[2] << 14,
1385 in[0] >> 25 | in[1] << 7, in[0]);
1386 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1387 _mm256_storeu_si256((__m256i*)(out), results);
1388 out += 8;
1389
1390 // shift the second 8 outs
1391 reg_shifts = _mm256_set_epi32(0, 0, 5, 0,
1392 0, 0, 1, 0);
1393 reg_inls = _mm256_set_epi32(in[11] >> 23 | in[12] << 9, in[10] >> 30 | in[11] << 2,
1394 in[10], in[9] >> 12 | in[10] << 20,
1395 in[8] >> 19 | in[9] << 13, in[7] >> 26 | in[8] << 6,
1396 in[7], in[6] >> 8 | in[7] << 24);
1397 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1398 _mm256_storeu_si256((__m256i*)(out), results);
1399 out += 8;
1400
1401 // shift the third 8 outs
1402 reg_shifts = _mm256_set_epi32(0, 6, 0, 0,
1403 0, 2, 0, 0);
1404 reg_inls = _mm256_set_epi32(in[17] >> 31 | in[18] << 1, in[17],
1405 in[16] >> 13 | in[17] << 19, in[15] >> 20 | in[16] << 12,
1406 in[14] >> 27 | in[15] << 5, in[14],
1407 in[13] >> 9 | in[14] << 23, in[12] >> 16 | in[13] << 16);
1408 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1409 _mm256_storeu_si256((__m256i*)(out), results);
1410 out += 8;
1411
1412 // shift the last 8 outs
1413 reg_shifts = _mm256_set_epi32(7, 0, 0, 0,
1414 3, 0, 0, 0);
1415 reg_inls = _mm256_set_epi32(in[24], in[23] >> 14 | in[24] << 18,
1416 in[22] >> 21 | in[23] << 11, in[21] >> 28 | in[22] << 4,
1417 in[21], in[20] >> 10 | in[21] << 22,
1418 in[19] >> 17 | in[20] << 15, in[18] >> 24 | in[19] << 8);
1419 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1420 _mm256_storeu_si256((__m256i*)(out), results);
1421 out += 8;
1422
1423 in += 25;
1424
1425 return in;
1426 }
1427
1428 inline static const uint32_t* unpack26_32_avx2(const uint32_t* in, uint32_t* out) {
1429 uint32_t mask = 0x3ffffff;
1430 __m256i reg_shifts, reg_inls, reg_masks;
1431 __m256i results;
1432
1433 reg_masks = _mm256_set1_epi32(mask);
1434
1435 // shift the first 8 outs
1436 reg_shifts = _mm256_set_epi32(0, 0, 2, 0,
1437 0, 0, 0, 0);
1438 reg_inls = _mm256_set_epi32(in[5] >> 22 | in[6] << 10, in[4] >> 28 | in[5] << 4,
1439 in[4], in[3] >> 8 | in[4] << 24,
1440 in[2] >> 14 | in[3] << 18, in[1] >> 20 | in[2] << 12,
1441 in[0] >> 26 | in[1] << 6, in[0]);
1442 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1443 _mm256_storeu_si256((__m256i*)(out), results);
1444 out += 8;
1445
1446 // shift the second 8 outs
1447 reg_shifts = _mm256_set_epi32(6, 0, 0, 0,
1448 0, 4, 0, 0);
1449 reg_inls = _mm256_set_epi32(in[12], in[11] >> 12 | in[12] << 20,
1450 in[10] >> 18 | in[11] << 14, in[9] >> 24 | in[10] << 8,
1451 in[8] >> 30 | in[9] << 2, in[8],
1452 in[7] >> 10 | in[8] << 22, in[6] >> 16 | in[7] << 16);
1453 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1454 _mm256_storeu_si256((__m256i*)(out), results);
1455 out += 8;
1456
1457 // shift the third 8 outs
1458 reg_shifts = _mm256_set_epi32(0, 0, 2, 0,
1459 0, 0, 0, 0);
1460 reg_inls = _mm256_set_epi32(in[18] >> 22 | in[19] << 10, in[17] >> 28 | in[18] << 4,
1461 in[17], in[16] >> 8 | in[17] << 24,
1462 in[15] >> 14 | in[16] << 18, in[14] >> 20 | in[15] << 12,
1463 in[13] >> 26 | in[14] << 6, in[13]);
1464 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1465 _mm256_storeu_si256((__m256i*)(out), results);
1466 out += 8;
1467
1468 // shift the last 8 outs
1469 reg_shifts = _mm256_set_epi32(6, 0, 0, 0,
1470 0, 4, 0, 0);
1471 reg_inls = _mm256_set_epi32(in[25], in[24] >> 12 | in[25] << 20,
1472 in[23] >> 18 | in[24] << 14, in[22] >> 24 | in[23] << 8,
1473 in[21] >> 30 | in[22] << 2, in[21],
1474 in[20] >> 10 | in[21] << 22, in[19] >> 16 | in[20] << 16);
1475 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1476 _mm256_storeu_si256((__m256i*)(out), results);
1477 out += 8;
1478
1479 in += 26;
1480
1481 return in;
1482 }
1483
1484 inline static const uint32_t* unpack27_32_avx2(const uint32_t* in, uint32_t* out) {
1485 uint32_t mask = 0x7ffffff;
1486 __m256i reg_shifts, reg_inls, reg_masks;
1487 __m256i results;
1488
1489 reg_masks = _mm256_set1_epi32(mask);
1490
1491 // shift the first 8 outs
1492 reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
1493 0, 0, 0, 0);
1494 reg_inls = _mm256_set_epi32(in[5] >> 29 | in[6] << 3, in[5],
1495 in[4] >> 7 | in[5] << 25, in[3] >> 12 | in[4] << 20,
1496 in[2] >> 17 | in[3] << 15, in[1] >> 22 | in[2] << 10,
1497 in[0] >> 27 | in[1] << 5, in[0]);
1498 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1499 _mm256_storeu_si256((__m256i*)(out), results);
1500 out += 8;
1501
1502 // shift the second 8 outs
1503 reg_shifts = _mm256_set_epi32(0, 0, 0, 4,
1504 0, 0, 0, 0);
1505 reg_inls = _mm256_set_epi32(in[12] >> 21 | in[13] << 11, in[11] >> 26 | in[12] << 6,
1506 in[10] >> 31 | in[11] << 1, in[10],
1507 in[9] >> 9 | in[10] << 23, in[8] >> 14 | in[9] << 18,
1508 in[7] >> 19 | in[8] << 13, in[6] >> 24 | in[7] << 8);
1509 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1510 _mm256_storeu_si256((__m256i*)(out), results);
1511 out += 8;
1512
1513 // shift the third 8 outs
1514 reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1515 1, 0, 0, 0);
1516 reg_inls = _mm256_set_epi32(in[19] >> 13 | in[20] << 19, in[18] >> 18 | in[19] << 14,
1517 in[17] >> 23 | in[18] << 9, in[16] >> 28 | in[17] << 4,
1518 in[16], in[15] >> 6 | in[16] << 26,
1519 in[14] >> 11 | in[15] << 21, in[13] >> 16 | in[14] << 16);
1520 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1521 _mm256_storeu_si256((__m256i*)(out), results);
1522 out += 8;
1523
1524 // shift the last 8 outs
1525 reg_shifts = _mm256_set_epi32(5, 0, 0, 0,
1526 0, 0, 3, 0);
1527 reg_inls = _mm256_set_epi32(in[26], in[25] >> 10 | in[26] << 22,
1528 in[24] >> 15 | in[25] << 17, in[23] >> 20 | in[24] << 12,
1529 in[22] >> 25 | in[23] << 7, in[21] >> 30 | in[22] << 2,
1530 in[21], in[20] >> 8 | in[21] << 24);
1531 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1532 _mm256_storeu_si256((__m256i*)(out), results);
1533 out += 8;
1534
1535 in += 27;
1536
1537 return in;
1538 }
1539
1540 inline static const uint32_t* unpack28_32_avx2(const uint32_t* in, uint32_t* out) {
1541 uint32_t mask = 0xfffffff;
1542 __m256i reg_shifts, reg_inls, reg_masks;
1543 __m256i results;
1544
1545 reg_masks = _mm256_set1_epi32(mask);
1546
1547 // shift the first 8 outs
1548 reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
1549 0, 0, 0, 0);
1550 reg_inls = _mm256_set_epi32(in[6], in[5] >> 8 | in[6] << 24,
1551 in[4] >> 12 | in[5] << 20, in[3] >> 16 | in[4] << 16,
1552 in[2] >> 20 | in[3] << 12, in[1] >> 24 | in[2] << 8,
1553 in[0] >> 28 | in[1] << 4, in[0]);
1554 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1555 _mm256_storeu_si256((__m256i*)(out), results);
1556 out += 8;
1557
1558 // shift the second 8 outs
1559 reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
1560 0, 0, 0, 0);
1561 reg_inls = _mm256_set_epi32(in[13], in[12] >> 8 | in[13] << 24,
1562 in[11] >> 12 | in[12] << 20, in[10] >> 16 | in[11] << 16,
1563 in[9] >> 20 | in[10] << 12, in[8] >> 24 | in[9] << 8,
1564 in[7] >> 28 | in[8] << 4, in[7]);
1565 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1566 _mm256_storeu_si256((__m256i*)(out), results);
1567 out += 8;
1568
1569 // shift the third 8 outs
1570 reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
1571 0, 0, 0, 0);
1572 reg_inls = _mm256_set_epi32(in[20], in[19] >> 8 | in[20] << 24,
1573 in[18] >> 12 | in[19] << 20, in[17] >> 16 | in[18] << 16,
1574 in[16] >> 20 | in[17] << 12, in[15] >> 24 | in[16] << 8,
1575 in[14] >> 28 | in[15] << 4, in[14]);
1576 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1577 _mm256_storeu_si256((__m256i*)(out), results);
1578 out += 8;
1579
1580 // shift the last 8 outs
1581 reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
1582 0, 0, 0, 0);
1583 reg_inls = _mm256_set_epi32(in[27], in[26] >> 8 | in[27] << 24,
1584 in[25] >> 12 | in[26] << 20, in[24] >> 16 | in[25] << 16,
1585 in[23] >> 20 | in[24] << 12, in[22] >> 24 | in[23] << 8,
1586 in[21] >> 28 | in[22] << 4, in[21]);
1587 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1588 _mm256_storeu_si256((__m256i*)(out), results);
1589 out += 8;
1590
1591 in += 28;
1592
1593 return in;
1594 }
1595
1596 inline static const uint32_t* unpack29_32_avx2(const uint32_t* in, uint32_t* out) {
1597 uint32_t mask = 0x1fffffff;
1598 __m256i reg_shifts, reg_inls, reg_masks;
1599 __m256i results;
1600
1601 reg_masks = _mm256_set1_epi32(mask);
1602
1603 // shift the first 8 outs
1604 reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1605 0, 0, 0, 0);
1606 reg_inls = _mm256_set_epi32(in[6] >> 11 | in[7] << 21, in[5] >> 14 | in[6] << 18,
1607 in[4] >> 17 | in[5] << 15, in[3] >> 20 | in[4] << 12,
1608 in[2] >> 23 | in[3] << 9, in[1] >> 26 | in[2] << 6,
1609 in[0] >> 29 | in[1] << 3, in[0]);
1610 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1611 _mm256_storeu_si256((__m256i*)(out), results);
1612 out += 8;
1613
1614 // shift the second 8 outs
1615 reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1616 0, 2, 0, 0);
1617 reg_inls = _mm256_set_epi32(in[13] >> 19 | in[14] << 13, in[12] >> 22 | in[13] << 10,
1618 in[11] >> 25 | in[12] << 7, in[10] >> 28 | in[11] << 4,
1619 in[9] >> 31 | in[10] << 1, in[9],
1620 in[8] >> 5 | in[9] << 27, in[7] >> 8 | in[8] << 24);
1621 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1622 _mm256_storeu_si256((__m256i*)(out), results);
1623 out += 8;
1624
1625 // shift the third 8 outs
1626 reg_shifts = _mm256_set_epi32(0, 0, 1, 0,
1627 0, 0, 0, 0);
1628 reg_inls = _mm256_set_epi32(in[20] >> 27 | in[21] << 5, in[19] >> 30 | in[20] << 2,
1629 in[19], in[18] >> 4 | in[19] << 28,
1630 in[17] >> 7 | in[18] << 25, in[16] >> 10 | in[17] << 22,
1631 in[15] >> 13 | in[16] << 19, in[14] >> 16 | in[15] << 16);
1632 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1633 _mm256_storeu_si256((__m256i*)(out), results);
1634 out += 8;
1635
1636 // shift the last 8 outs
1637 reg_shifts = _mm256_set_epi32(3, 0, 0, 0,
1638 0, 0, 0, 0);
1639 reg_inls = _mm256_set_epi32(in[28], in[27] >> 6 | in[28] << 26,
1640 in[26] >> 9 | in[27] << 23, in[25] >> 12 | in[26] << 20,
1641 in[24] >> 15 | in[25] << 17, in[23] >> 18 | in[24] << 14,
1642 in[22] >> 21 | in[23] << 11, in[21] >> 24 | in[22] << 8);
1643 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1644 _mm256_storeu_si256((__m256i*)(out), results);
1645 out += 8;
1646
1647 in += 29;
1648
1649 return in;
1650 }
1651
1652 inline static const uint32_t* unpack30_32_avx2(const uint32_t* in, uint32_t* out) {
1653 uint32_t mask = 0x3fffffff;
1654 __m256i reg_shifts, reg_inls, reg_masks;
1655 __m256i results;
1656
1657 reg_masks = _mm256_set1_epi32(mask);
1658
1659 // shift the first 8 outs
1660 reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1661 0, 0, 0, 0);
1662 reg_inls = _mm256_set_epi32(in[6] >> 18 | in[7] << 14, in[5] >> 20 | in[6] << 12,
1663 in[4] >> 22 | in[5] << 10, in[3] >> 24 | in[4] << 8,
1664 in[2] >> 26 | in[3] << 6, in[1] >> 28 | in[2] << 4,
1665 in[0] >> 30 | in[1] << 2, in[0]);
1666 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1667 _mm256_storeu_si256((__m256i*)(out), results);
1668 out += 8;
1669
1670 // shift the second 8 outs
1671 reg_shifts = _mm256_set_epi32(2, 0, 0, 0,
1672 0, 0, 0, 0);
1673 reg_inls = _mm256_set_epi32(in[14], in[13] >> 4 | in[14] << 28,
1674 in[12] >> 6 | in[13] << 26, in[11] >> 8 | in[12] << 24,
1675 in[10] >> 10 | in[11] << 22, in[9] >> 12 | in[10] << 20,
1676 in[8] >> 14 | in[9] << 18, in[7] >> 16 | in[8] << 16);
1677 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1678 _mm256_storeu_si256((__m256i*)(out), results);
1679 out += 8;
1680
1681 // shift the third 8 outs
1682 reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1683 0, 0, 0, 0);
1684 reg_inls = _mm256_set_epi32(in[21] >> 18 | in[22] << 14, in[20] >> 20 | in[21] << 12,
1685 in[19] >> 22 | in[20] << 10, in[18] >> 24 | in[19] << 8,
1686 in[17] >> 26 | in[18] << 6, in[16] >> 28 | in[17] << 4,
1687 in[15] >> 30 | in[16] << 2, in[15]);
1688 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1689 _mm256_storeu_si256((__m256i*)(out), results);
1690 out += 8;
1691
1692 // shift the last 8 outs
1693 reg_shifts = _mm256_set_epi32(2, 0, 0, 0,
1694 0, 0, 0, 0);
1695 reg_inls = _mm256_set_epi32(in[29], in[28] >> 4 | in[29] << 28,
1696 in[27] >> 6 | in[28] << 26, in[26] >> 8 | in[27] << 24,
1697 in[25] >> 10 | in[26] << 22, in[24] >> 12 | in[25] << 20,
1698 in[23] >> 14 | in[24] << 18, in[22] >> 16 | in[23] << 16);
1699 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1700 _mm256_storeu_si256((__m256i*)(out), results);
1701 out += 8;
1702
1703 in += 30;
1704
1705 return in;
1706 }
1707
1708 inline static const uint32_t* unpack31_32_avx2(const uint32_t* in, uint32_t* out) {
1709 uint32_t mask = 0x7fffffff;
1710 __m256i reg_shifts, reg_inls, reg_masks;
1711 __m256i results;
1712
1713 reg_masks = _mm256_set1_epi32(mask);
1714
1715 // shift the first 8 outs
1716 reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1717 0, 0, 0, 0);
1718 reg_inls = _mm256_set_epi32(in[6] >> 25 | in[7] << 7, in[5] >> 26 | in[6] << 6,
1719 in[4] >> 27 | in[5] << 5, in[3] >> 28 | in[4] << 4,
1720 in[2] >> 29 | in[3] << 3, in[1] >> 30 | in[2] << 2,
1721 in[0] >> 31 | in[1] << 1, in[0]);
1722 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1723 _mm256_storeu_si256((__m256i*)(out), results);
1724 out += 8;
1725
1726 // shift the second 8 outs
1727 reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1728 0, 0, 0, 0);
1729 reg_inls = _mm256_set_epi32(in[14] >> 17 | in[15] << 15, in[13] >> 18 | in[14] << 14,
1730 in[12] >> 19 | in[13] << 13, in[11] >> 20 | in[12] << 12,
1731 in[10] >> 21 | in[11] << 11, in[9] >> 22 | in[10] << 10,
1732 in[8] >> 23 | in[9] << 9, in[7] >> 24 | in[8] << 8);
1733 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1734 _mm256_storeu_si256((__m256i*)(out), results);
1735 out += 8;
1736
1737 // shift the third 8 outs
1738 reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1739 0, 0, 0, 0);
1740 reg_inls = _mm256_set_epi32(in[22] >> 9 | in[23] << 23, in[21] >> 10 | in[22] << 22,
1741 in[20] >> 11 | in[21] << 21, in[19] >> 12 | in[20] << 20,
1742 in[18] >> 13 | in[19] << 19, in[17] >> 14 | in[18] << 18,
1743 in[16] >> 15 | in[17] << 17, in[15] >> 16 | in[16] << 16);
1744 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1745 _mm256_storeu_si256((__m256i*)(out), results);
1746 out += 8;
1747
1748 // shift the last 8 outs
1749 reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
1750 0, 0, 0, 0);
1751 reg_inls = _mm256_set_epi32(in[30], in[29] >> 2 | in[30] << 30,
1752 in[28] >> 3 | in[29] << 29, in[27] >> 4 | in[28] << 28,
1753 in[26] >> 5 | in[27] << 27, in[25] >> 6 | in[26] << 26,
1754 in[24] >> 7 | in[25] << 25, in[23] >> 8 | in[24] << 24);
1755 results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
1756 _mm256_storeu_si256((__m256i*)(out), results);
1757 out += 8;
1758
1759 in += 31;
1760
1761 return in;
1762 }
1763
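// With 32 bits per value the packed and unpacked layouts coincide, so the
// kernel degenerates to a straight copy of 32 words.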
1764 inline static const uint32_t* unpack32_32_avx2(const uint32_t* in, uint32_t* out) {
1765 memcpy(out, in, 32 * sizeof(*out));
1766 in += 32;
1767 out += 32;
1768
1769 return in;
1770 }
1771
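// Dispatch entry point: decodes batch_size values (rounded down to a multiple
// of 32) packed at num_bits bits per value, and returns the number of values
// actually written.  num_bits is expected to be in [0, 32]; any other value
// falls through the switch and leaves the output untouched.  A remainder of
// fewer than 32 values is left for the caller to decode, presumably with a
// scalar fallback.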
1772 int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
1773 batch_size = batch_size / 32 * 32;
1774 int num_loops = batch_size / 32;
1775
1776 switch (num_bits) {
1777 case 0:
1778 for (int i = 0; i < num_loops; ++i) in = unpack0_32_avx2(in, out + i * 32);
1779 break;
1780 case 1:
1781 for (int i = 0; i < num_loops; ++i) in = unpack1_32_avx2(in, out + i * 32);
1782 break;
1783 case 2:
1784 for (int i = 0; i < num_loops; ++i) in = unpack2_32_avx2(in, out + i * 32);
1785 break;
1786 case 3:
1787 for (int i = 0; i < num_loops; ++i) in = unpack3_32_avx2(in, out + i * 32);
1788 break;
1789 case 4:
1790 for (int i = 0; i < num_loops; ++i) in = unpack4_32_avx2(in, out + i * 32);
1791 break;
1792 case 5:
1793 for (int i = 0; i < num_loops; ++i) in = unpack5_32_avx2(in, out + i * 32);
1794 break;
1795 case 6:
1796 for (int i = 0; i < num_loops; ++i) in = unpack6_32_avx2(in, out + i * 32);
1797 break;
1798 case 7:
1799 for (int i = 0; i < num_loops; ++i) in = unpack7_32_avx2(in, out + i * 32);
1800 break;
1801 case 8:
1802 for (int i = 0; i < num_loops; ++i) in = unpack8_32_avx2(in, out + i * 32);
1803 break;
1804 case 9:
1805 for (int i = 0; i < num_loops; ++i) in = unpack9_32_avx2(in, out + i * 32);
1806 break;
1807 case 10:
1808 for (int i = 0; i < num_loops; ++i) in = unpack10_32_avx2(in, out + i * 32);
1809 break;
1810 case 11:
1811 for (int i = 0; i < num_loops; ++i) in = unpack11_32_avx2(in, out + i * 32);
1812 break;
1813 case 12:
1814 for (int i = 0; i < num_loops; ++i) in = unpack12_32_avx2(in, out + i * 32);
1815 break;
1816 case 13:
1817 for (int i = 0; i < num_loops; ++i) in = unpack13_32_avx2(in, out + i * 32);
1818 break;
1819 case 14:
1820 for (int i = 0; i < num_loops; ++i) in = unpack14_32_avx2(in, out + i * 32);
1821 break;
1822 case 15:
1823 for (int i = 0; i < num_loops; ++i) in = unpack15_32_avx2(in, out + i * 32);
1824 break;
1825 case 16:
1826 for (int i = 0; i < num_loops; ++i) in = unpack16_32_avx2(in, out + i * 32);
1827 break;
1828 case 17:
1829 for (int i = 0; i < num_loops; ++i) in = unpack17_32_avx2(in, out + i * 32);
1830 break;
1831 case 18:
1832 for (int i = 0; i < num_loops; ++i) in = unpack18_32_avx2(in, out + i * 32);
1833 break;
1834 case 19:
1835 for (int i = 0; i < num_loops; ++i) in = unpack19_32_avx2(in, out + i * 32);
1836 break;
1837 case 20:
1838 for (int i = 0; i < num_loops; ++i) in = unpack20_32_avx2(in, out + i * 32);
1839 break;
1840 case 21:
1841 for (int i = 0; i < num_loops; ++i) in = unpack21_32_avx2(in, out + i * 32);
1842 break;
1843 case 22:
1844 for (int i = 0; i < num_loops; ++i) in = unpack22_32_avx2(in, out + i * 32);
1845 break;
1846 case 23:
1847 for (int i = 0; i < num_loops; ++i) in = unpack23_32_avx2(in, out + i * 32);
1848 break;
1849 case 24:
1850 for (int i = 0; i < num_loops; ++i) in = unpack24_32_avx2(in, out + i * 32);
1851 break;
1852 case 25:
1853 for (int i = 0; i < num_loops; ++i) in = unpack25_32_avx2(in, out + i * 32);
1854 break;
1855 case 26:
1856 for (int i = 0; i < num_loops; ++i) in = unpack26_32_avx2(in, out + i * 32);
1857 break;
1858 case 27:
1859 for (int i = 0; i < num_loops; ++i) in = unpack27_32_avx2(in, out + i * 32);
1860 break;
1861 case 28:
1862 for (int i = 0; i < num_loops; ++i) in = unpack28_32_avx2(in, out + i * 32);
1863 break;
1864 case 29:
1865 for (int i = 0; i < num_loops; ++i) in = unpack29_32_avx2(in, out + i * 32);
1866 break;
1867 case 30:
1868 for (int i = 0; i < num_loops; ++i) in = unpack30_32_avx2(in, out + i * 32);
1869 break;
1870 case 31:
1871 for (int i = 0; i < num_loops; ++i) in = unpack31_32_avx2(in, out + i * 32);
1872 break;
1873 case 32:
1874 for (int i = 0; i < num_loops; ++i) in = unpack32_32_avx2(in, out + i * 32);
1875 break;
1876 }
1877
1878 return batch_size;
1879 }
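
// ---------------------------------------------------------------------------
// Hedged usage sketch (not part of the generated kernels above): a minimal,
// self-contained illustration of how unpack32_avx2 could be exercised against
// a straightforward scalar reference.  The guard macro
// BIT_PACKING_AVX2_SELFTEST and the helper/main names below are hypothetical
// and are not used anywhere else in Arrow; treat this as one possible way to
// validate the kernels, not as the library's test harness.
#ifdef BIT_PACKING_AVX2_SELFTEST
#include <stdio.h>
#include <stdlib.h>

// Scalar reference: read the i-th num_bits-wide value from a little-endian
// bit-packed stream stored in 32-bit words.
static uint32_t read_packed_scalar(const uint32_t* in, int i, int num_bits) {
  if (num_bits == 0) return 0;
  uint64_t bit = (uint64_t)i * (uint64_t)num_bits;
  uint64_t word = bit / 32;
  unsigned shift = (unsigned)(bit % 32);
  uint64_t v = in[word] >> shift;
  if (shift + (unsigned)num_bits > 32) {
    v |= (uint64_t)in[word + 1] << (32 - shift);
  }
  // The shift below is done in 64 bits, so it is well defined for num_bits <= 32.
  return (uint32_t)(v & ((1ULL << num_bits) - 1));
}

int main(void) {
  enum { kValues = 64 };      // two full blocks of 32 values
  uint32_t packed[kValues];   // upper bound: 32 bits per value
  uint32_t out[kValues];

  for (int num_bits = 0; num_bits <= 32; ++num_bits) {
    // Pack kValues deterministic test values of num_bits each into `packed`.
    memset(packed, 0, sizeof(packed));
    for (int i = 0; i < kValues; ++i) {
      uint64_t v = (uint32_t)i * 2654435761u;   // arbitrary deterministic pattern
      v &= (num_bits == 0) ? 0 : ((1ULL << num_bits) - 1);
      uint64_t bit = (uint64_t)i * (uint64_t)num_bits;
      unsigned shift = (unsigned)(bit % 32);
      packed[bit / 32] |= (uint32_t)(v << shift);
      if (shift + (unsigned)num_bits > 32) {
        packed[bit / 32 + 1] |= (uint32_t)(v >> (32 - shift));
      }
    }

    // Decode with the AVX2 dispatcher and compare against the scalar reference.
    int done = unpack32_avx2(packed, out, kValues, num_bits);
    for (int i = 0; i < done; ++i) {
      uint32_t want = read_packed_scalar(packed, i, num_bits);
      if (out[i] != want) {
        fprintf(stderr, "mismatch: num_bits=%d i=%d got=%u want=%u\n",
                num_bits, i, out[i], want);
        return EXIT_FAILURE;
      }
    }
  }
  puts("unpack32_avx2 self-test passed");
  return EXIT_SUCCESS;
}
#endif  // BIT_PACKING_AVX2_SELFTEST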