// Source: ceph/src/arrow/go/parquet/internal/utils/_lib/bit_packing_avx2.c
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
#include <stdint.h>
#include <string.h>

#include <immintrin.h>
21 inline const uint32_t* unpack0_32_avx2(const uint32_t* in
, uint32_t* out
) {
22 memset(out
, 0x0, 32 * sizeof(*out
));
// Unpack 32 contiguously packed 1-bit values from in[0] into out[0..31],
// one value per uint32, eight lanes at a time via AVX2 variable shifts.
// Returns the input pointer advanced past the 1 word consumed.
inline static const uint32_t* unpack1_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  reg_inls = _mm256_set_epi32(in[0], in[0], in[0], in[0],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
  reg_inls = _mm256_set_epi32(in[0], in[0], in[0], in[0],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(23, 22, 21, 20, 19, 18, 17, 16);
  reg_inls = _mm256_set_epi32(in[0], in[0], in[0], in[0],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(31, 30, 29, 28, 27, 26, 25, 24);
  reg_inls = _mm256_set_epi32(in[0], in[0], in[0], in[0],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 1;
  return in;
}
// Unpack 32 contiguously packed 2-bit values from in[0..1] into out[0..31],
// eight lanes at a time via AVX2 variable shifts. Returns in + 2.
inline static const uint32_t* unpack2_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);
  reg_inls = _mm256_set_epi32(in[0], in[0], in[0], in[0],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(30, 28, 26, 24, 22, 20, 18, 16);
  reg_inls = _mm256_set_epi32(in[0], in[0], in[0], in[0],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1], in[1], in[1],
                              in[1], in[1], in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(30, 28, 26, 24, 22, 20, 18, 16);
  reg_inls = _mm256_set_epi32(in[1], in[1], in[1], in[1],
                              in[1], in[1], in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 2;
  return in;
}
// Unpack 32 contiguously packed 3-bit values from in[0..2] into out[0..31].
// Values that straddle a 32-bit word boundary are stitched together with a
// shift/OR pair before the variable right-shift. Returns in + 3.
inline static const uint32_t* unpack3_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(21, 18, 15, 12, 9, 6, 3, 0);
  reg_inls = _mm256_set_epi32(in[0], in[0], in[0], in[0],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(13, 10, 7, 4, 1, 0, 27, 24);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[1], in[1],
                              in[1], in[0] >> 30 | in[1] << 2,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(5, 2, 0, 28, 25, 22, 19, 16);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[1] >> 31 | in[2] << 1, in[1],
                              in[1], in[1],
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(29, 26, 23, 20, 17, 14, 11, 8);
  reg_inls = _mm256_set_epi32(in[2], in[2], in[2], in[2],
                              in[2], in[2], in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 3;
  return in;
}
// Unpack 32 contiguously packed 4-bit values from in[0..3] into out[0..31];
// 4 divides 32 so no value straddles a word boundary. Returns in + 4.
inline static const uint32_t* unpack4_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xf;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(28, 24, 20, 16, 12, 8, 4, 0);
  reg_inls = _mm256_set_epi32(in[0], in[0], in[0], in[0],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(28, 24, 20, 16, 12, 8, 4, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1], in[1], in[1],
                              in[1], in[1], in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(28, 24, 20, 16, 12, 8, 4, 0);
  reg_inls = _mm256_set_epi32(in[2], in[2], in[2], in[2],
                              in[2], in[2], in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(28, 24, 20, 16, 12, 8, 4, 0);
  reg_inls = _mm256_set_epi32(in[3], in[3], in[3], in[3],
                              in[3], in[3], in[3], in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 4;
  return in;
}
// Unpack 32 contiguously packed 5-bit values from in[0..4] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 5.
inline static const uint32_t* unpack5_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1f;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(3, 0, 25, 20, 15, 10, 5, 0);
  reg_inls = _mm256_set_epi32(in[1], in[0] >> 30 | in[1] << 2,
                              in[0], in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(11, 6, 1, 0, 23, 18, 13, 8);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[2], in[1] >> 28 | in[2] << 4,
                              in[1], in[1],
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(19, 14, 9, 4, 0, 26, 21, 16);
  reg_inls = _mm256_set_epi32(in[3], in[3],
                              in[3], in[3],
                              in[2] >> 31 | in[3] << 1, in[2],
                              in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(27, 22, 17, 12, 7, 2, 0, 24);
  reg_inls = _mm256_set_epi32(in[4], in[4],
                              in[4], in[4],
                              in[4], in[4],
                              in[3] >> 29 | in[4] << 3, in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 5;
  return in;
}
// Unpack 32 contiguously packed 6-bit values from in[0..5] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 6.
inline static const uint32_t* unpack6_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3f;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(10, 4, 0, 24, 18, 12, 6, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[0] >> 30 | in[1] << 2, in[0],
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(26, 20, 14, 8, 2, 0, 22, 16);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[2], in[2],
                              in[2], in[1] >> 28 | in[2] << 4,
                              in[1], in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(10, 4, 0, 24, 18, 12, 6, 0);
  reg_inls = _mm256_set_epi32(in[4], in[4],
                              in[3] >> 30 | in[4] << 2, in[3],
                              in[3], in[3],
                              in[3], in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(26, 20, 14, 8, 2, 0, 22, 16);
  reg_inls = _mm256_set_epi32(in[5], in[5],
                              in[5], in[5],
                              in[5], in[4] >> 28 | in[5] << 4,
                              in[4], in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 6;
  return in;
}
// Unpack 32 contiguously packed 7-bit values from in[0..6] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 7.
inline static const uint32_t* unpack7_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7f;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(17, 10, 3, 0, 21, 14, 7, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1],
                              in[1], in[0] >> 28 | in[1] << 4,
                              in[0], in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(9, 2, 0, 20, 13, 6, 0, 24);
  reg_inls = _mm256_set_epi32(in[3], in[3],
                              in[2] >> 27 | in[3] << 5, in[2],
                              in[2], in[2],
                              in[1] >> 31 | in[2] << 1, in[1]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(1, 0, 19, 12, 5, 0, 23, 16);
  reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
                              in[4], in[4],
                              in[4], in[3] >> 30 | in[4] << 2,
                              in[3], in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(25, 18, 11, 4, 0, 22, 15, 8);
  reg_inls = _mm256_set_epi32(in[6], in[6],
                              in[6], in[6],
                              in[5] >> 29 | in[6] << 3, in[5],
                              in[5], in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 7;
  return in;
}
// Unpack 32 contiguously packed 8-bit values from in[0..7] into out[0..31];
// 8 divides 32 so no value straddles a word boundary. Returns in + 8.
inline static const uint32_t* unpack8_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(24, 16, 8, 0, 24, 16, 8, 0);
  reg_inls = _mm256_set_epi32(in[1], in[1], in[1], in[1],
                              in[0], in[0], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(24, 16, 8, 0, 24, 16, 8, 0);
  reg_inls = _mm256_set_epi32(in[3], in[3], in[3], in[3],
                              in[2], in[2], in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(24, 16, 8, 0, 24, 16, 8, 0);
  reg_inls = _mm256_set_epi32(in[5], in[5], in[5], in[5],
                              in[4], in[4], in[4], in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(24, 16, 8, 0, 24, 16, 8, 0);
  reg_inls = _mm256_set_epi32(in[7], in[7], in[7], in[7],
                              in[6], in[6], in[6], in[6]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 8;
  return in;
}
// Unpack 32 contiguously packed 9-bit values from in[0..8] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 9.
inline static const uint32_t* unpack9_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 22, 13, 4, 0, 18, 9, 0);
  reg_inls = _mm256_set_epi32(in[1] >> 31 | in[2] << 1, in[1],
                              in[1], in[1],
                              in[0] >> 27 | in[1] << 5, in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(7, 0, 21, 12, 3, 0, 17, 8);
  reg_inls = _mm256_set_epi32(in[4], in[3] >> 30 | in[4] << 2,
                              in[3], in[3],
                              in[3], in[2] >> 26 | in[3] << 6,
                              in[2], in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(15, 6, 0, 20, 11, 2, 0, 16);
  reg_inls = _mm256_set_epi32(in[6], in[6],
                              in[5] >> 29 | in[6] << 3, in[5],
                              in[5], in[5],
                              in[4] >> 25 | in[5] << 7, in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(23, 14, 5, 0, 19, 10, 1, 0);
  reg_inls = _mm256_set_epi32(in[8], in[8],
                              in[8], in[7] >> 28 | in[8] << 4,
                              in[7], in[7],
                              in[7], in[6] >> 24 | in[7] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 9;
  return in;
}
// Unpack 32 contiguously packed 10-bit values from in[0..9] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 10.
inline static const uint32_t* unpack10_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(6, 0, 18, 8, 0, 20, 10, 0);
  reg_inls = _mm256_set_epi32(in[2], in[1] >> 28 | in[2] << 4,
                              in[1], in[1],
                              in[0] >> 30 | in[1] << 2, in[0],
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(22, 12, 2, 0, 14, 4, 0, 16);
  reg_inls = _mm256_set_epi32(in[4], in[4],
                              in[4], in[3] >> 24 | in[4] << 8,
                              in[3], in[3],
                              in[2] >> 26 | in[3] << 6, in[2]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(6, 0, 18, 8, 0, 20, 10, 0);
  reg_inls = _mm256_set_epi32(in[7], in[6] >> 28 | in[7] << 4,
                              in[6], in[6],
                              in[5] >> 30 | in[6] << 2, in[5],
                              in[5], in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(22, 12, 2, 0, 14, 4, 0, 16);
  reg_inls = _mm256_set_epi32(in[9], in[9],
                              in[9], in[8] >> 24 | in[9] << 8,
                              in[8], in[8],
                              in[7] >> 26 | in[8] << 6, in[7]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 10;
  return in;
}
// Unpack 32 contiguously packed 11-bit values from in[0..10] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 11.
inline static const uint32_t* unpack11_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(13, 2, 0, 12, 1, 0, 11, 0);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[1] >> 23 | in[2] << 9, in[1],
                              in[1], in[0] >> 22 | in[1] << 10,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(5, 0, 15, 4, 0, 14, 3, 0);
  reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
                              in[4], in[4],
                              in[3] >> 25 | in[4] << 7, in[3],
                              in[3], in[2] >> 24 | in[3] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 18, 7, 0, 17, 6, 0, 16);
  reg_inls = _mm256_set_epi32(in[7] >> 29 | in[8] << 3, in[7],
                              in[7], in[6] >> 28 | in[7] << 4,
                              in[6], in[6],
                              in[5] >> 27 | in[6] << 5, in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(21, 10, 0, 20, 9, 0, 19, 8);
  reg_inls = _mm256_set_epi32(in[10], in[10],
                              in[9] >> 31 | in[10] << 1, in[9],
                              in[9], in[8] >> 30 | in[9] << 2,
                              in[8], in[8]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 11;
  return in;
}
// Unpack 32 contiguously packed 12-bit values from in[0..11] into out[0..31].
// The 8-value pattern repeats every 3 input words. Returns in + 12.
inline static const uint32_t* unpack12_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(20, 8, 0, 16, 4, 0, 12, 0);
  reg_inls = _mm256_set_epi32(in[2], in[2],
                              in[1] >> 28 | in[2] << 4, in[1],
                              in[1], in[0] >> 24 | in[1] << 8,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(20, 8, 0, 16, 4, 0, 12, 0);
  reg_inls = _mm256_set_epi32(in[5], in[5],
                              in[4] >> 28 | in[5] << 4, in[4],
                              in[4], in[3] >> 24 | in[4] << 8,
                              in[3], in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(20, 8, 0, 16, 4, 0, 12, 0);
  reg_inls = _mm256_set_epi32(in[8], in[8],
                              in[7] >> 28 | in[8] << 4, in[7],
                              in[7], in[6] >> 24 | in[7] << 8,
                              in[6], in[6]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(20, 8, 0, 16, 4, 0, 12, 0);
  reg_inls = _mm256_set_epi32(in[11], in[11],
                              in[10] >> 28 | in[11] << 4, in[10],
                              in[10], in[9] >> 24 | in[10] << 8,
                              in[9], in[9]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 12;
  return in;
}
// Unpack 32 contiguously packed 13-bit values from in[0..12] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 13.
inline static const uint32_t* unpack13_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 14, 1, 0, 7, 0, 13, 0);
  reg_inls = _mm256_set_epi32(in[2] >> 27 | in[3] << 5, in[2],
                              in[2], in[1] >> 20 | in[2] << 12,
                              in[1], in[0] >> 26 | in[1] << 6,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(3, 0, 9, 0, 15, 2, 0, 8);
  reg_inls = _mm256_set_epi32(in[6], in[5] >> 22 | in[6] << 10,
                              in[5], in[4] >> 28 | in[5] << 4,
                              in[4], in[4],
                              in[3] >> 21 | in[4] << 11, in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(11, 0, 17, 4, 0, 10, 0, 16);
  reg_inls = _mm256_set_epi32(in[9], in[8] >> 30 | in[9] << 2,
                              in[8], in[8],
                              in[7] >> 23 | in[8] << 9, in[7],
                              in[6] >> 29 | in[7] << 3, in[6]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(19, 6, 0, 12, 0, 18, 5, 0);
  reg_inls = _mm256_set_epi32(in[12], in[12],
                              in[11] >> 25 | in[12] << 7, in[11],
                              in[10] >> 31 | in[11] << 1, in[10],
                              in[10], in[9] >> 24 | in[10] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 13;
  return in;
}
// Unpack 32 contiguously packed 14-bit values from in[0..13] into out[0..31].
// The 16-value pattern repeats every 7 input words. Returns in + 14.
inline static const uint32_t* unpack14_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(2, 0, 6, 0, 10, 0, 14, 0);
  reg_inls = _mm256_set_epi32(in[3], in[2] >> 20 | in[3] << 12,
                              in[2], in[1] >> 24 | in[2] << 8,
                              in[1], in[0] >> 28 | in[1] << 4,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(18, 4, 0, 8, 0, 12, 0, 16);
  reg_inls = _mm256_set_epi32(in[6], in[6],
                              in[5] >> 22 | in[6] << 10, in[5],
                              in[4] >> 26 | in[5] << 6, in[4],
                              in[3] >> 30 | in[4] << 2, in[3]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(2, 0, 6, 0, 10, 0, 14, 0);
  reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
                              in[9], in[8] >> 24 | in[9] << 8,
                              in[8], in[7] >> 28 | in[8] << 4,
                              in[7], in[7]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(18, 4, 0, 8, 0, 12, 0, 16);
  reg_inls = _mm256_set_epi32(in[13], in[13],
                              in[12] >> 22 | in[13] << 10, in[12],
                              in[11] >> 26 | in[12] << 6, in[11],
                              in[10] >> 30 | in[11] << 2, in[10]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 14;
  return in;
}
// Unpack 32 contiguously packed 15-bit values from in[0..14] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 15.
inline static const uint32_t* unpack15_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(9, 0, 11, 0, 13, 0, 15, 0);
  reg_inls = _mm256_set_epi32(in[3], in[2] >> 26 | in[3] << 6,
                              in[2], in[1] >> 28 | in[2] << 4,
                              in[1], in[0] >> 30 | in[1] << 2,
                              in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(1, 0, 3, 0, 5, 0, 7, 0);
  reg_inls = _mm256_set_epi32(in[7], in[6] >> 18 | in[7] << 14,
                              in[6], in[5] >> 20 | in[6] << 12,
                              in[5], in[4] >> 22 | in[5] << 10,
                              in[4], in[3] >> 24 | in[4] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 10, 0, 12, 0, 14, 0, 16);
  reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
                              in[9] >> 27 | in[10] << 5, in[9],
                              in[8] >> 29 | in[9] << 3, in[8],
                              in[7] >> 31 | in[8] << 1, in[7]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(17, 2, 0, 4, 0, 6, 0, 8);
  reg_inls = _mm256_set_epi32(in[14], in[14],
                              in[13] >> 19 | in[14] << 13, in[13],
                              in[12] >> 21 | in[13] << 11, in[12],
                              in[11] >> 23 | in[12] << 9, in[11]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 15;
  return in;
}
// Unpack 32 contiguously packed 16-bit values from in[0..15] into out[0..31];
// 16 divides 32 so no value straddles a word boundary. Returns in + 16.
inline static const uint32_t* unpack16_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(16, 0, 16, 0, 16, 0, 16, 0);
  reg_inls = _mm256_set_epi32(in[3], in[3], in[2], in[2],
                              in[1], in[1], in[0], in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(16, 0, 16, 0, 16, 0, 16, 0);
  reg_inls = _mm256_set_epi32(in[7], in[7], in[6], in[6],
                              in[5], in[5], in[4], in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(16, 0, 16, 0, 16, 0, 16, 0);
  reg_inls = _mm256_set_epi32(in[11], in[11], in[10], in[10],
                              in[9], in[9], in[8], in[8]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(16, 0, 16, 0, 16, 0, 16, 0);
  reg_inls = _mm256_set_epi32(in[15], in[15], in[14], in[14],
                              in[13], in[13], in[12], in[12]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 16;
  return in;
}
// Unpack 32 contiguously packed 17-bit values from in[0..16] into out[0..31].
// Every other value straddles a word boundary and is stitched with a
// shift/OR pair. Returns in + 17.
inline static const uint32_t* unpack17_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 6, 0, 4, 0, 2, 0, 0);
  reg_inls = _mm256_set_epi32(in[3] >> 23 | in[4] << 9, in[3],
                              in[2] >> 21 | in[3] << 11, in[2],
                              in[1] >> 19 | in[2] << 13, in[1],
                              in[0] >> 17 | in[1] << 15, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 14, 0, 12, 0, 10, 0, 8);
  reg_inls = _mm256_set_epi32(in[7] >> 31 | in[8] << 1, in[7],
                              in[6] >> 29 | in[7] << 3, in[6],
                              in[5] >> 27 | in[6] << 5, in[5],
                              in[4] >> 25 | in[5] << 7, in[4]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(7, 0, 5, 0, 3, 0, 1, 0);
  reg_inls = _mm256_set_epi32(in[12], in[11] >> 22 | in[12] << 10,
                              in[11], in[10] >> 20 | in[11] << 12,
                              in[10], in[9] >> 18 | in[10] << 14,
                              in[9], in[8] >> 16 | in[9] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(15, 0, 13, 0, 11, 0, 9, 0);
  reg_inls = _mm256_set_epi32(in[16], in[15] >> 30 | in[16] << 2,
                              in[15], in[14] >> 28 | in[15] << 4,
                              in[14], in[13] >> 26 | in[14] << 6,
                              in[13], in[12] >> 24 | in[13] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 17;
  return in;
}
// Unpack 32 contiguously packed 18-bit values from in[0..17] into out[0..31].
// Every other value straddles a word boundary and is stitched with a
// shift/OR pair. Returns in + 18.
inline static const uint32_t* unpack18_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 12, 0, 8, 0, 4, 0, 0);
  reg_inls = _mm256_set_epi32(in[3] >> 30 | in[4] << 2, in[3],
                              in[2] >> 26 | in[3] << 6, in[2],
                              in[1] >> 22 | in[2] << 10, in[1],
                              in[0] >> 18 | in[1] << 14, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(14, 0, 10, 0, 6, 0, 2, 0);
  reg_inls = _mm256_set_epi32(in[8], in[7] >> 28 | in[8] << 4,
                              in[7], in[6] >> 24 | in[7] << 8,
                              in[6], in[5] >> 20 | in[6] << 12,
                              in[5], in[4] >> 16 | in[5] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 12, 0, 8, 0, 4, 0, 0);
  reg_inls = _mm256_set_epi32(in[12] >> 30 | in[13] << 2, in[12],
                              in[11] >> 26 | in[12] << 6, in[11],
                              in[10] >> 22 | in[11] << 10, in[10],
                              in[9] >> 18 | in[10] << 14, in[9]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(14, 0, 10, 0, 6, 0, 2, 0);
  reg_inls = _mm256_set_epi32(in[17], in[16] >> 28 | in[17] << 4,
                              in[16], in[15] >> 24 | in[16] << 8,
                              in[15], in[14] >> 20 | in[15] << 12,
                              in[14], in[13] >> 16 | in[14] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 18;
  return in;
}
// Unpack 32 contiguously packed 19-bit values from in[0..18] into out[0..31].
// Boundary-straddling values are stitched with a shift/OR pair. Returns in + 19.
inline static const uint32_t* unpack19_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(5, 0, 0, 12, 0, 6, 0, 0);
  reg_inls = _mm256_set_epi32(in[4], in[3] >> 18 | in[4] << 14,
                              in[2] >> 31 | in[3] << 1, in[2],
                              in[1] >> 25 | in[2] << 7, in[1],
                              in[0] >> 19 | in[1] << 13, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 10, 0, 4, 0, 0, 11, 0);
  reg_inls = _mm256_set_epi32(in[8] >> 29 | in[9] << 3, in[8],
                              in[7] >> 23 | in[8] << 9, in[7],
                              in[6] >> 17 | in[7] << 15, in[5] >> 30 | in[6] << 2,
                              in[5], in[4] >> 24 | in[5] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 2, 0, 0, 9, 0, 3, 0);
  reg_inls = _mm256_set_epi32(in[13] >> 21 | in[14] << 11, in[13],
                              in[12] >> 15 | in[13] << 17, in[11] >> 28 | in[12] << 4,
                              in[11], in[10] >> 22 | in[11] << 10,
                              in[10], in[9] >> 16 | in[10] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(13, 0, 7, 0, 1, 0, 0, 8);
  reg_inls = _mm256_set_epi32(in[18], in[17] >> 26 | in[18] << 6,
                              in[17], in[16] >> 20 | in[17] << 12,
                              in[16], in[15] >> 14 | in[16] << 18,
                              in[14] >> 27 | in[15] << 5, in[14]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 19;
  return in;
}
// Unpack 32 contiguously packed 20-bit values from in[0..19] into out[0..31].
// The 8-value pattern repeats every 5 input words. Returns in + 20.
inline static const uint32_t* unpack20_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(12, 0, 4, 0, 0, 8, 0, 0);
  reg_inls = _mm256_set_epi32(in[4], in[3] >> 24 | in[4] << 8,
                              in[3], in[2] >> 16 | in[3] << 16,
                              in[1] >> 28 | in[2] << 4, in[1],
                              in[0] >> 20 | in[1] << 12, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(12, 0, 4, 0, 0, 8, 0, 0);
  reg_inls = _mm256_set_epi32(in[9], in[8] >> 24 | in[9] << 8,
                              in[8], in[7] >> 16 | in[8] << 16,
                              in[6] >> 28 | in[7] << 4, in[6],
                              in[5] >> 20 | in[6] << 12, in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(12, 0, 4, 0, 0, 8, 0, 0);
  reg_inls = _mm256_set_epi32(in[14], in[13] >> 24 | in[14] << 8,
                              in[13], in[12] >> 16 | in[13] << 16,
                              in[11] >> 28 | in[12] << 4, in[11],
                              in[10] >> 20 | in[11] << 12, in[10]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(12, 0, 4, 0, 0, 8, 0, 0);
  reg_inls = _mm256_set_epi32(in[19], in[18] >> 24 | in[19] << 8,
                              in[18], in[17] >> 16 | in[18] << 16,
                              in[16] >> 28 | in[17] << 4, in[16],
                              in[15] >> 20 | in[16] << 12, in[15]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);

  in += 20;
  return in;
}
// Unpack 32 values packed at 21 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 21 input words and returns the advanced input pointer (in + 21).
// Per lane: a value fully contained in one word is shifted into place; a
// value spanning two words is pre-combined with `>>`/`<<` and uses shift 0.
inline static const uint32_t* unpack21_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 9, 0,
                                0, 10, 0, 0);
  reg_inls = _mm256_set_epi32(in[4] >> 19 | in[5] << 13, in[3] >> 30 | in[4] << 2,
                              in[3], in[2] >> 20 | in[3] << 12,
                              in[1] >> 31 | in[2] << 1, in[1],
                              in[0] >> 21 | in[1] << 11, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 6, 0, 0,
                                0, 7, 0, 8);
  reg_inls = _mm256_set_epi32(in[9] >> 27 | in[10] << 5, in[9],
                              in[8] >> 17 | in[9] << 15, in[7] >> 28 | in[8] << 4,
                              in[7], in[6] >> 18 | in[7] << 14,
                              in[5] >> 29 | in[6] << 3, in[5]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(3, 0, 0, 4,
                                0, 5, 0, 0);
  reg_inls = _mm256_set_epi32(in[15], in[14] >> 14 | in[15] << 18,
                              in[13] >> 25 | in[14] << 7, in[13],
                              in[12] >> 15 | in[13] << 17, in[11] >> 26 | in[12] << 6,
                              in[11], in[10] >> 16 | in[11] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(11, 0, 1, 0,
                                0, 2, 0, 0);
  reg_inls = _mm256_set_epi32(in[20], in[19] >> 22 | in[20] << 10,
                              in[19], in[18] >> 12 | in[19] << 20,
                              in[17] >> 23 | in[18] << 9, in[17],
                              in[16] >> 13 | in[17] << 19, in[15] >> 24 | in[16] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 21;

  return in;
}
// Unpack 32 values packed at 22 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 22 input words and returns the advanced input pointer (in + 22).
// The layout repeats every 11 words / 16 values, so groups 1&3 and 2&4
// share shift patterns.
inline static const uint32_t* unpack22_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 4, 0, 0,
                                2, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[4] >> 26 | in[5] << 6, in[4],
                              in[3] >> 14 | in[4] << 18, in[2] >> 24 | in[3] << 8,
                              in[2], in[1] >> 12 | in[2] << 20,
                              in[0] >> 22 | in[1] << 10, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(10, 0, 0, 8,
                                0, 6, 0, 0);
  reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
                              in[8] >> 30 | in[9] << 2, in[8],
                              in[7] >> 18 | in[8] << 14, in[6] >> 28 | in[7] << 4,
                              in[6], in[5] >> 16 | in[6] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 4, 0, 0,
                                2, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[15] >> 26 | in[16] << 6, in[15],
                              in[14] >> 14 | in[15] << 18, in[13] >> 24 | in[14] << 8,
                              in[13], in[12] >> 12 | in[13] << 20,
                              in[11] >> 22 | in[12] << 10, in[11]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(10, 0, 0, 8,
                                0, 6, 0, 0);
  reg_inls = _mm256_set_epi32(in[21], in[20] >> 20 | in[21] << 12,
                              in[19] >> 30 | in[20] << 2, in[19],
                              in[18] >> 18 | in[19] << 14, in[17] >> 28 | in[18] << 4,
                              in[17], in[16] >> 16 | in[17] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 22;

  return in;
}
// Unpack 32 values packed at 23 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 23 input words and returns the advanced input pointer (in + 23).
// Per lane: a value fully contained in one word is shifted into place; a
// value spanning two words is pre-combined with `>>`/`<<` and uses shift 0.
inline static const uint32_t* unpack23_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
                                0, 5, 0, 0);
  reg_inls = _mm256_set_epi32(in[5], in[4] >> 10 | in[5] << 22,
                              in[3] >> 19 | in[4] << 13, in[2] >> 28 | in[3] << 4,
                              in[2], in[1] >> 14 | in[2] << 18,
                              in[0] >> 23 | in[1] << 9, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
                                0, 6, 0, 0);
  reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
                              in[9] >> 11 | in[10] << 21, in[8] >> 20 | in[9] << 12,
                              in[7] >> 29 | in[8] << 3, in[7],
                              in[6] >> 15 | in[7] << 17, in[5] >> 24 | in[6] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 3, 0,
                                0, 0, 7, 0);
  reg_inls = _mm256_set_epi32(in[16] >> 17 | in[17] << 15, in[15] >> 26 | in[16] << 6,
                              in[15], in[14] >> 12 | in[15] << 20,
                              in[13] >> 21 | in[14] << 11, in[12] >> 30 | in[13] << 2,
                              in[12], in[11] >> 16 | in[12] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(9, 0, 0, 4,
                                0, 0, 0, 8);
  reg_inls = _mm256_set_epi32(in[22], in[21] >> 18 | in[22] << 14,
                              in[20] >> 27 | in[21] << 5, in[20],
                              in[19] >> 13 | in[20] << 19, in[18] >> 22 | in[19] << 10,
                              in[17] >> 31 | in[18] << 1, in[17]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 23;

  return in;
}
// Unpack 32 values packed at 24 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 24 input words and returns the advanced input pointer (in + 24).
// The layout repeats every 3 words / 4 values, so all four groups use the
// same shift vector.
inline static const uint32_t* unpack24_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
                                8, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[5], in[4] >> 16 | in[5] << 16,
                              in[3] >> 24 | in[4] << 8, in[3],
                              in[2], in[1] >> 16 | in[2] << 16,
                              in[0] >> 24 | in[1] << 8, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
                                8, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[11], in[10] >> 16 | in[11] << 16,
                              in[9] >> 24 | in[10] << 8, in[9],
                              in[8], in[7] >> 16 | in[8] << 16,
                              in[6] >> 24 | in[7] << 8, in[6]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
                                8, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[17], in[16] >> 16 | in[17] << 16,
                              in[15] >> 24 | in[16] << 8, in[15],
                              in[14], in[13] >> 16 | in[14] << 16,
                              in[12] >> 24 | in[13] << 8, in[12]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
                                8, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[23], in[22] >> 16 | in[23] << 16,
                              in[21] >> 24 | in[22] << 8, in[21],
                              in[20], in[19] >> 16 | in[20] << 16,
                              in[18] >> 24 | in[19] << 8, in[18]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 24;

  return in;
}
// Unpack 32 values packed at 25 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 25 input words and returns the advanced input pointer (in + 25).
// Per lane: a value fully contained in one word is shifted into place; a
// value spanning two words is pre-combined with `>>`/`<<` and uses shift 0.
inline static const uint32_t* unpack25_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 4,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[5] >> 15 | in[6] << 17, in[4] >> 22 | in[5] << 10,
                              in[3] >> 29 | in[4] << 3, in[3],
                              in[2] >> 11 | in[3] << 21, in[1] >> 18 | in[2] << 14,
                              in[0] >> 25 | in[1] << 7, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 5, 0,
                                0, 0, 1, 0);
  reg_inls = _mm256_set_epi32(in[11] >> 23 | in[12] << 9, in[10] >> 30 | in[11] << 2,
                              in[10], in[9] >> 12 | in[10] << 20,
                              in[8] >> 19 | in[9] << 13, in[7] >> 26 | in[8] << 6,
                              in[7], in[6] >> 8 | in[7] << 24);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 6, 0, 0,
                                0, 2, 0, 0);
  reg_inls = _mm256_set_epi32(in[17] >> 31 | in[18] << 1, in[17],
                              in[16] >> 13 | in[17] << 19, in[15] >> 20 | in[16] << 12,
                              in[14] >> 27 | in[15] << 5, in[14],
                              in[13] >> 9 | in[14] << 23, in[12] >> 16 | in[13] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(7, 0, 0, 0,
                                3, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[24], in[23] >> 14 | in[24] << 18,
                              in[22] >> 21 | in[23] << 11, in[21] >> 28 | in[22] << 4,
                              in[21], in[20] >> 10 | in[21] << 22,
                              in[19] >> 17 | in[20] << 15, in[18] >> 24 | in[19] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 25;

  return in;
}
// Unpack 32 values packed at 26 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 26 input words and returns the advanced input pointer (in + 26).
// The layout repeats every 13 words / 16 values, so groups 1&3 and 2&4
// share shift patterns.
inline static const uint32_t* unpack26_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 2, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[5] >> 22 | in[6] << 10, in[4] >> 28 | in[5] << 4,
                              in[4], in[3] >> 8 | in[4] << 24,
                              in[2] >> 14 | in[3] << 18, in[1] >> 20 | in[2] << 12,
                              in[0] >> 26 | in[1] << 6, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(6, 0, 0, 0,
                                0, 4, 0, 0);
  reg_inls = _mm256_set_epi32(in[12], in[11] >> 12 | in[12] << 20,
                              in[10] >> 18 | in[11] << 14, in[9] >> 24 | in[10] << 8,
                              in[8] >> 30 | in[9] << 2, in[8],
                              in[7] >> 10 | in[8] << 22, in[6] >> 16 | in[7] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 2, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[18] >> 22 | in[19] << 10, in[17] >> 28 | in[18] << 4,
                              in[17], in[16] >> 8 | in[17] << 24,
                              in[15] >> 14 | in[16] << 18, in[14] >> 20 | in[15] << 12,
                              in[13] >> 26 | in[14] << 6, in[13]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(6, 0, 0, 0,
                                0, 4, 0, 0);
  reg_inls = _mm256_set_epi32(in[25], in[24] >> 12 | in[25] << 20,
                              in[23] >> 18 | in[24] << 14, in[22] >> 24 | in[23] << 8,
                              in[21] >> 30 | in[22] << 2, in[21],
                              in[20] >> 10 | in[21] << 22, in[19] >> 16 | in[20] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 26;

  return in;
}
// Unpack 32 values packed at 27 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 27 input words and returns the advanced input pointer (in + 27).
// Per lane: a value fully contained in one word is shifted into place; a
// value spanning two words is pre-combined with `>>`/`<<` and uses shift 0.
inline static const uint32_t* unpack27_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[5] >> 29 | in[6] << 3, in[5],
                              in[4] >> 7 | in[5] << 25, in[3] >> 12 | in[4] << 20,
                              in[2] >> 17 | in[3] << 15, in[1] >> 22 | in[2] << 10,
                              in[0] >> 27 | in[1] << 5, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 4,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[12] >> 21 | in[13] << 11, in[11] >> 26 | in[12] << 6,
                              in[10] >> 31 | in[11] << 1, in[10],
                              in[9] >> 9 | in[10] << 23, in[8] >> 14 | in[9] << 18,
                              in[7] >> 19 | in[8] << 13, in[6] >> 24 | in[7] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                1, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[19] >> 13 | in[20] << 19, in[18] >> 18 | in[19] << 14,
                              in[17] >> 23 | in[18] << 9, in[16] >> 28 | in[17] << 4,
                              in[16], in[15] >> 6 | in[16] << 26,
                              in[14] >> 11 | in[15] << 21, in[13] >> 16 | in[14] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(5, 0, 0, 0,
                                0, 0, 3, 0);
  reg_inls = _mm256_set_epi32(in[26], in[25] >> 10 | in[26] << 22,
                              in[24] >> 15 | in[25] << 17, in[23] >> 20 | in[24] << 12,
                              in[22] >> 25 | in[23] << 7, in[21] >> 30 | in[22] << 2,
                              in[21], in[20] >> 8 | in[21] << 24);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 27;

  return in;
}
// Unpack 32 values packed at 28 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 28 input words and returns the advanced input pointer (in + 28).
// The layout repeats every 7 words / 8 values, so all four groups use the
// same shift vector.
inline static const uint32_t* unpack28_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[6], in[5] >> 8 | in[6] << 24,
                              in[4] >> 12 | in[5] << 20, in[3] >> 16 | in[4] << 16,
                              in[2] >> 20 | in[3] << 12, in[1] >> 24 | in[2] << 8,
                              in[0] >> 28 | in[1] << 4, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[13], in[12] >> 8 | in[13] << 24,
                              in[11] >> 12 | in[12] << 20, in[10] >> 16 | in[11] << 16,
                              in[9] >> 20 | in[10] << 12, in[8] >> 24 | in[9] << 8,
                              in[7] >> 28 | in[8] << 4, in[7]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[20], in[19] >> 8 | in[20] << 24,
                              in[18] >> 12 | in[19] << 20, in[17] >> 16 | in[18] << 16,
                              in[16] >> 20 | in[17] << 12, in[15] >> 24 | in[16] << 8,
                              in[14] >> 28 | in[15] << 4, in[14]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[27], in[26] >> 8 | in[27] << 24,
                              in[25] >> 12 | in[26] << 20, in[24] >> 16 | in[25] << 16,
                              in[23] >> 20 | in[24] << 12, in[22] >> 24 | in[23] << 8,
                              in[21] >> 28 | in[22] << 4, in[21]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 28;

  return in;
}
// Unpack 32 values packed at 29 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 29 input words and returns the advanced input pointer (in + 29).
// At 29 bits nearly every value spans two words, so most lanes use the
// pre-combined `>>`/`<<` form with shift 0.
inline static const uint32_t* unpack29_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[6] >> 11 | in[7] << 21, in[5] >> 14 | in[6] << 18,
                              in[4] >> 17 | in[5] << 15, in[3] >> 20 | in[4] << 12,
                              in[2] >> 23 | in[3] << 9, in[1] >> 26 | in[2] << 6,
                              in[0] >> 29 | in[1] << 3, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 2, 0, 0);
  reg_inls = _mm256_set_epi32(in[13] >> 19 | in[14] << 13, in[12] >> 22 | in[13] << 10,
                              in[11] >> 25 | in[12] << 7, in[10] >> 28 | in[11] << 4,
                              in[9] >> 31 | in[10] << 1, in[9],
                              in[8] >> 5 | in[9] << 27, in[7] >> 8 | in[8] << 24);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 1, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[20] >> 27 | in[21] << 5, in[19] >> 30 | in[20] << 2,
                              in[19], in[18] >> 4 | in[19] << 28,
                              in[17] >> 7 | in[18] << 25, in[16] >> 10 | in[17] << 22,
                              in[15] >> 13 | in[16] << 19, in[14] >> 16 | in[15] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(3, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[28], in[27] >> 6 | in[28] << 26,
                              in[26] >> 9 | in[27] << 23, in[25] >> 12 | in[26] << 20,
                              in[24] >> 15 | in[25] << 17, in[23] >> 18 | in[24] << 14,
                              in[22] >> 21 | in[23] << 11, in[21] >> 24 | in[22] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 29;

  return in;
}
// Unpack 32 values packed at 30 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 30 input words and returns the advanced input pointer (in + 30).
// The layout repeats every 15 words / 16 values, so groups 1&3 and 2&4
// share shift patterns.
inline static const uint32_t* unpack30_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[6] >> 18 | in[7] << 14, in[5] >> 20 | in[6] << 12,
                              in[4] >> 22 | in[5] << 10, in[3] >> 24 | in[4] << 8,
                              in[2] >> 26 | in[3] << 6, in[1] >> 28 | in[2] << 4,
                              in[0] >> 30 | in[1] << 2, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(2, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[14], in[13] >> 4 | in[14] << 28,
                              in[12] >> 6 | in[13] << 26, in[11] >> 8 | in[12] << 24,
                              in[10] >> 10 | in[11] << 22, in[9] >> 12 | in[10] << 20,
                              in[8] >> 14 | in[9] << 18, in[7] >> 16 | in[8] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[21] >> 18 | in[22] << 14, in[20] >> 20 | in[21] << 12,
                              in[19] >> 22 | in[20] << 10, in[18] >> 24 | in[19] << 8,
                              in[17] >> 26 | in[18] << 6, in[16] >> 28 | in[17] << 4,
                              in[15] >> 30 | in[16] << 2, in[15]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(2, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[29], in[28] >> 4 | in[29] << 28,
                              in[27] >> 6 | in[28] << 26, in[26] >> 8 | in[27] << 24,
                              in[25] >> 10 | in[26] << 22, in[24] >> 12 | in[25] << 20,
                              in[23] >> 14 | in[24] << 18, in[22] >> 16 | in[23] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 30;

  return in;
}
// Unpack 32 values packed at 31 bits each from `in` into `out` (one
// uint32_t per value) using AVX2 variable right-shifts.
// Consumes 31 input words and returns the advanced input pointer (in + 31).
// At 31 bits every value except the first and last spans two words, so all
// middle lanes use the pre-combined `>>`/`<<` form with shift 0.
inline static const uint32_t* unpack31_32_avx2(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fffffff;
  __m256i reg_shifts, reg_inls, reg_masks;
  __m256i results;

  reg_masks = _mm256_set1_epi32(mask);

  // shift the first 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[6] >> 25 | in[7] << 7, in[5] >> 26 | in[6] << 6,
                              in[4] >> 27 | in[5] << 5, in[3] >> 28 | in[4] << 4,
                              in[2] >> 29 | in[3] << 3, in[1] >> 30 | in[2] << 2,
                              in[0] >> 31 | in[1] << 1, in[0]);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the second 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[14] >> 17 | in[15] << 15, in[13] >> 18 | in[14] << 14,
                              in[12] >> 19 | in[13] << 13, in[11] >> 20 | in[12] << 12,
                              in[10] >> 21 | in[11] << 11, in[9] >> 22 | in[10] << 10,
                              in[8] >> 23 | in[9] << 9, in[7] >> 24 | in[8] << 8);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the third 8 outs
  reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[22] >> 9 | in[23] << 23, in[21] >> 10 | in[22] << 22,
                              in[20] >> 11 | in[21] << 21, in[19] >> 12 | in[20] << 20,
                              in[18] >> 13 | in[19] << 19, in[17] >> 14 | in[18] << 18,
                              in[16] >> 15 | in[17] << 17, in[15] >> 16 | in[16] << 16);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  // shift the last 8 outs
  reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
                                0, 0, 0, 0);
  reg_inls = _mm256_set_epi32(in[30], in[29] >> 2 | in[30] << 30,
                              in[28] >> 3 | in[29] << 29, in[27] >> 4 | in[28] << 28,
                              in[26] >> 5 | in[27] << 27, in[25] >> 6 | in[26] << 26,
                              in[24] >> 7 | in[25] << 25, in[23] >> 8 | in[24] << 24);
  results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
  _mm256_storeu_si256((__m256i*)(out), results);
  out += 8;

  in += 31;

  return in;
}
1764 inline const uint32_t* unpack32_32_avx2(const uint32_t* in
, uint32_t* out
) {
1765 memcpy(out
, in
, 32 * sizeof(*out
));
1772 int unpack32_avx2(const uint32_t* in
, uint32_t* out
, int batch_size
, int num_bits
) {
1773 batch_size
= batch_size
/ 32 * 32;
1774 int num_loops
= batch_size
/ 32;
1778 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack0_32_avx2(in
, out
+ i
* 32);
1781 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack1_32_avx2(in
, out
+ i
* 32);
1784 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack2_32_avx2(in
, out
+ i
* 32);
1787 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack3_32_avx2(in
, out
+ i
* 32);
1790 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack4_32_avx2(in
, out
+ i
* 32);
1793 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack5_32_avx2(in
, out
+ i
* 32);
1796 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack6_32_avx2(in
, out
+ i
* 32);
1799 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack7_32_avx2(in
, out
+ i
* 32);
1802 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack8_32_avx2(in
, out
+ i
* 32);
1805 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack9_32_avx2(in
, out
+ i
* 32);
1808 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack10_32_avx2(in
, out
+ i
* 32);
1811 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack11_32_avx2(in
, out
+ i
* 32);
1814 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack12_32_avx2(in
, out
+ i
* 32);
1817 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack13_32_avx2(in
, out
+ i
* 32);
1820 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack14_32_avx2(in
, out
+ i
* 32);
1823 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack15_32_avx2(in
, out
+ i
* 32);
1826 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack16_32_avx2(in
, out
+ i
* 32);
1829 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack17_32_avx2(in
, out
+ i
* 32);
1832 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack18_32_avx2(in
, out
+ i
* 32);
1835 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack19_32_avx2(in
, out
+ i
* 32);
1838 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack20_32_avx2(in
, out
+ i
* 32);
1841 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack21_32_avx2(in
, out
+ i
* 32);
1844 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack22_32_avx2(in
, out
+ i
* 32);
1847 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack23_32_avx2(in
, out
+ i
* 32);
1850 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack24_32_avx2(in
, out
+ i
* 32);
1853 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack25_32_avx2(in
, out
+ i
* 32);
1856 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack26_32_avx2(in
, out
+ i
* 32);
1859 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack27_32_avx2(in
, out
+ i
* 32);
1862 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack28_32_avx2(in
, out
+ i
* 32);
1865 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack29_32_avx2(in
, out
+ i
* 32);
1868 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack30_32_avx2(in
, out
+ i
* 32);
1871 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack31_32_avx2(in
, out
+ i
* 32);
1874 for (int i
= 0; i
< num_loops
; ++i
) in
= unpack32_32_avx2(in
, out
+ i
* 32);