]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm
import 15.2.0 Octopus source
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx512 / des_x16_avx512.asm
1 ;;
2 ;; Copyright (c) 2017-2018, Intel Corporation
3 ;;
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
6 ;;
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
15 ;;
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ;;
27
28 ;; Authors:
29 ;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz Kantecki (2)
30 ;; (1) University of Haifa, Israel
31 ;; (2) Intel Corporation
32
33 ;; In System V AMD64 ABI
34 ;; callee saves: RBX, RBP, R12-R15
35 ;; Windows x64 ABI
36 ;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
37
38 ;;
39 ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
40 ;; -----------------------------------------------------------
41 ;; Windows clobbers: RAX R8 R9 R10 R11
42 ;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15
43 ;; -----------------------------------------------------------
44 ;; Linux clobbers: RAX RCX RDX R10 R11
45 ;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15
46 ;; -----------------------------------------------------------
47 ;; Clobbers ZMM0-31 and K1 to K7
48
49 %include "os.asm"
50 %include "reg_sizes.asm"
51 %include "mb_mgr_datastruct.asm"
52 %include "constants.asm"
53 ;%define DO_DBGPRINT
54 %include "dbgprint.asm"
55
56 %ifdef LINUX
57 %define arg1 rdi
58 %define arg2 rsi
59 %define arg3 rdx
60 %define arg4 rcx
61 %else
62 %define arg1 rcx
63 %define arg2 rdx
64 %define arg3 r8
65 %define arg4 r9
66 %endif
67
68 %define STATE arg1
69 %define SIZE arg2
70
71 %define OFFSET rax
72
73 %define IA0 arg3
74 %define IA1 arg4
75 %define IA2 r10
76
77 %define INP0 r11
78 %define INP1 r12
79 %define INP2 r13
80 %define INP3 r14
81 %define INP4 r15
82
83 %define KSOFFSET r11
84
85 %define ZW0 zmm0
86 %define ZW1 zmm1
87 %define ZW2 zmm2
88 %define ZW3 zmm3
89 %define ZW4 zmm4
90 %define ZW5 zmm5
91 %define ZW6 zmm6
92 %define ZW7 zmm7
93 %define ZW8 zmm8
94 %define ZW9 zmm9
95 %define ZW10 zmm10
96 %define ZW11 zmm11
97 %define ZW12 zmm12
98 %define ZW13 zmm13
99 %define ZW14 zmm14
100 %define ZW15 zmm15
101
102 %define ZIV0 zmm16
103 %define ZIV1 zmm17
104
105 %define ZTMP0 zmm18
106 %define ZTMP1 zmm19
107 %define ZTMP2 zmm20
108 %define ZTMP3 zmm21
109 %define ZTMP4 zmm22
110 %define ZTMP5 zmm23
111 %define ZTMP6 zmm24
112 %define ZTMP7 zmm25
113 %define ZTMP8 zmm26
114 %define ZTMP9 zmm27
115 %define ZTMP10 zmm28
116 %define ZTMP11 zmm29
117 %define ZTMP12 zmm30
118 %define ZTMP13 zmm31
119
120 struc STACKFRAME
121 _key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
122 _key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
123 _key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
124 _tmp_iv: resq 16 ; 2 x 64 bytes
125 _tmp_in: resq 16 ; 2 x 64 bytes
126 _tmp_out: resq 16 ; 2 x 64 bytes
127 _tmp_mask: resd 16 ; 1 x 64 bytes
128 _gpr_save: resq 4 ; r12 to r15
129 _rsp_save: resq 1
130 _mask_save: resq 1
131 _size_save: resq 1
132 endstruc
133
134 ;;; ===========================================================================
135 ;;; ===========================================================================
136 ;;; MACROS
137 ;;; ===========================================================================
138 ;;; ===========================================================================
139
140 ;;; ===========================================================================
141 ;;; PERMUTE
142 ;;; ===========================================================================
143 ;;; A [in/out] - zmm register
144 ;;; B [in/out] - zmm register
145 ;;; NSHIFT [in] - constant to shift words by
146 ;;; MASK [in] - zmm or m512 with mask
147 ;;; T0 [clobbered] - temporary zmm register
148 %macro PERMUTE 5
149 %define %%A %1
150 %define %%B %2
151 %define %%NSHIFT %3
152 %define %%MASK %4
153 %define %%T0 %5
154
155 vpsrld %%T0, %%A, %%NSHIFT
156 vpxord %%T0, %%T0, %%B
157 vpandd %%T0, %%T0, %%MASK
158 vpxord %%B, %%B, %%T0
159 vpslld %%T0, %%T0, %%NSHIFT
160 vpxord %%A, %%A, %%T0
161 %endmacro
162
163 ;;; ===========================================================================
164 ;;; INITIAL PERMUTATION
165 ;;; ===========================================================================
166 ;;; L [in/out] - zmm register
167 ;;; R [in/out] - zmm register
168 ;;; T0 [clobbered] - temporary zmm register
169 %macro IP_Z 3
170 %define %%L %1
171 %define %%R %2
172 %define %%T0 %3
173 PERMUTE %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0
174 PERMUTE %%L, %%R, 16, [rel init_perm_consts + 1*64], %%T0
175 PERMUTE %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0
176 PERMUTE %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0
177 PERMUTE %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0
178 %endmacro
179
180 ;;; ===========================================================================
181 ;;; FINAL PERMUTATION
182 ;;; ===========================================================================
183 ;;; L [in/out] - zmm register
184 ;;; R [in/out] - zmm register
185 ;;; T0 [clobbered] - temporary zmm register
186 %macro FP_Z 3
187 %define %%L %1
188 %define %%R %2
189 %define %%T0 %3
190 PERMUTE %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0
191 PERMUTE %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0
192 PERMUTE %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0
193 PERMUTE %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0
194 PERMUTE %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0
195 %endmacro
196
197 ;;; ===========================================================================
198 ;;; P PHASE
199 ;;; ===========================================================================
200 ;;; W0 [in/out] - zmm register
201 ;;; in: vector of 16 x 32bits from S phase
202 ;;; out: permuted in vector
203 ;;; T0-T3 [clobbered] - temporary zmm register
204 %macro P_PHASE 5
205 %define %%W0 %1
206 %define %%T0 %2
207 %define %%T1 %3
208 %define %%T2 %4
209 %define %%T3 %5
210
211 vprord %%T0, %%W0, 3
212 vpandd %%T0, %%T0, [rel mask_values + 0*64]
213 vprord %%T1, %%W0, 5
214 vpandd %%T1, %%T1, [rel mask_values + 1*64]
215 vpord %%T0, %%T0, %%T1
216
217 vprord %%T1, %%W0, 24
218 vpandd %%T1, %%T1, [rel mask_values + 2*64]
219 vprord %%T2, %%W0, 26
220 vpandd %%T2, %%T2, [rel mask_values + 3*64]
221 vpord %%T1, %%T1, %%T2
222 vpord %%T0, %%T0, %%T1
223
224 vprord %%T1, %%W0, 15
225 vpandd %%T1, %%T1, [rel mask_values + 4*64]
226 vprord %%T2, %%W0, 17
227 vpandd %%T2, %%T2, [rel mask_values + 5*64]
228 vpord %%T1, %%T1, %%T2
229
230 vprord %%T2, %%W0, 6
231 vpandd %%T2, %%T2, [rel mask_values + 6*64]
232 vprord %%T3, %%W0, 21
233 vpandd %%T3, %%T3, [rel mask_values + 7*64]
234 vpord %%T2, %%T2, %%T3
235 vpord %%T1, %%T1, %%T2
236 vpord %%T0, %%T0, %%T1
237
238 vprord %%T1, %%W0, 12
239 vpandd %%T1, %%T1, [rel mask_values + 8*64]
240 vprord %%T2, %%W0, 14
241 vpandd %%T2, %%T2, [rel mask_values + 9*64]
242 vpord %%T1, %%T1, %%T2
243
244 vprord %%T2, %%W0, 4
245 vpandd %%T2, %%T2, [rel mask_values + 10*64]
246 vprord %%T3, %%W0, 11
247 vpandd %%T3, %%T3, [rel mask_values + 11*64]
248 vpord %%T2, %%T2, %%T3
249 vpord %%T1, %%T1, %%T2
250 vpord %%T0, %%T0, %%T1
251
252 vprord %%T1, %%W0, 16
253 vpandd %%T1, %%T1, [rel mask_values + 12*64]
254 vprord %%T2, %%W0, 22
255 vpandd %%T2, %%T2, [rel mask_values + 13*64]
256 vpord %%T1, %%T1, %%T2
257
258 vprord %%T2, %%W0, 19
259 vpandd %%T2, %%T2, [rel mask_values + 14*64]
260 vprord %%T3, %%W0, 10
261 vpandd %%T3, %%T3, [rel mask_values + 15*64]
262 vpord %%T2, %%T2, %%T3
263 vpord %%T1, %%T1, %%T2
264 vpord %%T0, %%T0, %%T1
265
266 vprord %%T1, %%W0, 9
267 vpandd %%T1, %%T1, [rel mask_values + 16*64]
268 vprord %%T2, %%W0, 13
269 vpandd %%T2, %%T2, [rel mask_values + 17*64]
270 vpord %%T1, %%T1, %%T2
271
272 vprord %%T2, %%W0, 25
273 vpandd %%T2, %%T2, [rel mask_values + 18*64]
274 vpord %%T1, %%T1, %%T2
275 vpord %%W0, %%T0, %%T1
276 %endmacro
277
278 ;;; ===========================================================================
279 ;;; E PHASE
280 ;;; ===========================================================================
281 ;;;
282 ;;; Expands 16x32-bit words into 16x48-bit words
283 ;;; plus XOR's result with the key schedule.
284 ;;; The output is adjusted to be friendly as S phase input.
285 ;;;
286 ;;; in [in] - zmm register
287 ;;; out0a [out] - zmm register
288 ;;; out0b [out] - zmm register
289 ;;; out1a [out] - zmm register
290 ;;; out1b [out] - zmm register
291 ;;; k0 [in] - key schedule; zmm or m512
292 ;;; k1 [in] - key schedule; zmm or m512
293 ;;; t0-t1 [clobbered] - temporary zmm register
294 %macro E_PHASE 9
295 %define %%IN %1
296 %define %%OUT0A %2
297 %define %%OUT0B %3
298 %define %%OUT1A %4
299 %define %%OUT1B %5
300 %define %%K0 %6
301 %define %%K1 %7
302 %define %%T0 %8
303 %define %%T1 %9
304
305 vprord %%T0, %%IN, 31
306 vprord %%T1, %%IN, 3
307 vpshufb %%T0, %%T0, [rel idx_e]
308 vpshufb %%T1, %%T1, [rel idx_e]
309 vpunpcklbw %%OUT0A, %%T0, %%T1
310 vpunpckhbw %%OUT1A, %%T0, %%T1
311 vpxord %%OUT0A, %%OUT0A, %%K0
312 vpxord %%OUT1A, %%OUT1A, %%K1
313 vpandd %%OUT0B, %%OUT0A, [rel and_eu]
314 vpsrlw %%OUT0B, %%OUT0B, 8
315 vpandd %%OUT0A, %%OUT0A, [rel and_ed]
316 vpandd %%OUT1B, %%OUT1A, [rel and_eu]
317 vpsrlw %%OUT1B, %%OUT1B, 8
318 vpandd %%OUT1A, %%OUT1A, [rel and_ed]
319 %endmacro
320
321 ;;; ===========================================================================
322 ;;; S-BOX
323 ;;; ===========================================================================
324 ;;;
325 ;;; NOTE: clobbers k1-k6 OpMask registers
326 ;;;
327 ;;; IN0A [in] - zmm register; output from E-phase
328 ;;; IN0B [in] - zmm register; output from E-phase
329 ;;; IN1A [in] - zmm register; output from E-phase
330 ;;; IN1B [in] - zmm register; output from E-phase
331 ;;; OUT [out] - zmm register; output from E-phase
332 ;;; T0-T5 [clobbered] - temporary zmm register
333 %macro S_PHASE 11
334 %define %%IN0A %1
335 %define %%IN0B %2
336 %define %%IN1A %3
337 %define %%IN1B %4
338 %define %%OUT %5
339 %define %%T0 %6
340 %define %%T1 %7
341 %define %%T2 %8
342 %define %%T3 %9
343 %define %%T4 %10
344 %define %%T5 %11
345
346 vmovdqa64 %%T0, [rel reg_values16bit_7]
347 vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE
348 vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE
349 vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE
350 vpcmpuw k6, %%IN1B, %%T0, 2 ; 2 -> LE
351
352 mov DWORD(IA0), 0x55555555
353 kmovd k1, DWORD(IA0)
354 mov DWORD(IA0), 0xaaaaaaaa
355 kmovd k2, DWORD(IA0)
356
357 vpermw %%T0{k1}{z}, %%IN0A, [rel S_box_flipped + 0*64]
358 vpermw %%T1{k1}{z}, %%IN0A, [rel S_box_flipped + 1*64]
359 vpermw %%T2{k2}{z}, %%IN0A, [rel S_box_flipped + 4*64]
360 vpermw %%T3{k2}{z}, %%IN0A, [rel S_box_flipped + 5*64]
361 vpxord %%T0, %%T0, %%T2
362 vpxord %%OUT, %%T1, %%T3
363 vmovdqu16 %%OUT{k3}, %%T0
364
365 vpermw %%T0{k1}{z}, %%IN0B, [rel S_box_flipped + 2*64]
366 vpermw %%T1{k1}{z}, %%IN0B, [rel S_box_flipped + 3*64]
367 vpermw %%T2{k2}{z}, %%IN0B, [rel S_box_flipped + 6*64]
368 vpermw %%T3{k2}{z}, %%IN0B, [rel S_box_flipped + 7*64]
369 vpxord %%T0, %%T0, %%T2
370 vpxord %%T3, %%T1, %%T3
371 vmovdqu16 %%T3{k4}, %%T0
372 vpsllw %%T3, %%T3, 4
373 vpxord %%OUT, %%OUT, %%T3
374
375 vpermw %%T0{k1}{z}, %%IN1A, [rel S_box_flipped + 8*64]
376 vpermw %%T1{k1}{z}, %%IN1A, [rel S_box_flipped + 9*64]
377 vpermw %%T2{k2}{z}, %%IN1A, [rel S_box_flipped + 12*64]
378 vpermw %%T3{k2}{z}, %%IN1A, [rel S_box_flipped + 13*64]
379 vpxord %%T0, %%T0, %%T2
380 vpxord %%T4, %%T1, %%T3
381 vmovdqu16 %%T4{k5}, %%T0
382
383 vpermw %%T0{k1}{z}, %%IN1B, [rel S_box_flipped + 10*64]
384 vpermw %%T1{k1}{z}, %%IN1B, [rel S_box_flipped + 11*64]
385 vpermw %%T2{k2}{z}, %%IN1B, [rel S_box_flipped + 14*64]
386 vpermw %%T3{k2}{z}, %%IN1B, [rel S_box_flipped + 15*64]
387 vpxord %%T0, %%T0, %%T2
388 vpxord %%T5, %%T1, %%T3
389 vmovdqu16 %%T5{k6}, %%T0
390 vpsllw %%T5, %%T5, 4
391
392 vpxord %%T4, %%T4, %%T5
393 vpsllw %%T4, %%T4, 8
394 vpxord %%OUT, %%OUT, %%T4
395 vpshufb %%OUT, %%OUT, [rel shuffle_reg]
396 %endmacro
397
398 ;;; ===========================================================================
399 ;;; DES encryption/decryption round
400 ;;; ===========================================================================
401 ;;;
402 ;;; Clobbers k1-k6 OpMask registers
403 ;;;
404 ;;; ENC_DEC [in] - ENC for encryption, DEC for decryption
405 ;;; R [in/out] - zmm register; plain text in & cipher text out
406 ;;; L [in/out] - zmm register; plain text in & cipher text out
407 ;;; KS [in] - pointer to the key schedule
408 ;;; T0-T11 [clobbered] - temporary zmm register
409 %macro DES_ENC_DEC 16
410 %define %%ENC_DEC %1
411 %define %%R %2
412 %define %%L %3
413 %define %%KS %4
414 %define %%T0 %5
415 %define %%T1 %6
416 %define %%T2 %7
417 %define %%T3 %8
418 %define %%T4 %9
419 %define %%T5 %10
420 %define %%T6 %11
421 %define %%T7 %12
422 %define %%T8 %13
423 %define %%T9 %14
424 %define %%T10 %15
425 %define %%T11 %16
426
427 IP_Z %%R, %%L, %%T0
428
429 %ifidn %%ENC_DEC, ENC
430 ;; ENCRYPTION
431 xor KSOFFSET, KSOFFSET
432 %%_des_enc_loop:
433 E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7
434 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
435 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
436 vpxord %%L, %%L, %%T0
437
438 E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7
439 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
440 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
441 vpxord %%R, %%R, %%T0
442
443 add KSOFFSET, (4*64)
444 cmp KSOFFSET, (8*(4*64))
445 jb %%_des_enc_loop
446
447 %else
448 ;; DECRYPTION
449 mov KSOFFSET, (8*(4*64))
450 %%_des_dec_loop:
451 E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7
452 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
453 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
454 vpxord %%L, %%L, %%T0
455
456 E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7
457 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
458 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
459 vpxord %%R, %%R, %%T0
460 sub KSOFFSET, (4*64)
461 jnz %%_des_dec_loop
462 %endif ; DECRYPTION
463
464 FP_Z %%R, %%L, %%T0
465 %endmacro
466
467 ;;; ===========================================================================
468 ;;; DATA TRANSPOSITION AT DATA INPUT
469 ;;; ===========================================================================
470 ;;;
471 ;;; IN00 - IN15 [in/out]:
472 ;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
473 ;;; out: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
474 ;;; T0-T3 [clobbered] - temporary zmm registers
475 ;;; K0-K5 [clobbered] - temporary zmm registers
476 ;;; H0-H3 [clobbered] - temporary zmm registers
477 %macro TRANSPOSE_IN 30
478 %define %%IN00 %1 ; R0
479 %define %%IN01 %2 ; L0
480 %define %%IN02 %3 ; R1
481 %define %%IN03 %4 ; L1
482 %define %%IN04 %5 ; R2
483 %define %%IN05 %6 ; L2
484 %define %%IN06 %7 ; R3
485 %define %%IN07 %8 ; L3
486 %define %%IN08 %9 ; R4
487 %define %%IN09 %10 ; L4
488 %define %%IN10 %11 ; R5
489 %define %%IN11 %12 ; L5
490 %define %%IN12 %13 ; R6
491 %define %%IN13 %14 ; L6
492 %define %%IN14 %15 ; R7
493 %define %%IN15 %16 ; L7
494 %define %%T0 %17
495 %define %%T1 %18
496 %define %%T2 %19
497 %define %%T3 %20
498 %define %%K0 %21
499 %define %%K1 %22
500 %define %%K2 %23
501 %define %%K3 %24
502 %define %%K4 %25
503 %define %%K5 %26
504 %define %%H0 %27
505 %define %%H1 %28
506 %define %%H2 %29
507 %define %%H3 %30
508
509 vpunpckldq %%K0, %%IN00, %%IN01
510 vpunpckhdq %%K1, %%IN00, %%IN01
511 vpunpckldq %%T0, %%IN02, %%IN03
512 vpunpckhdq %%T1, %%IN02, %%IN03
513
514 vpunpckldq %%IN00, %%IN04, %%IN05
515 vpunpckhdq %%IN01, %%IN04, %%IN05
516 vpunpckldq %%IN02, %%IN06, %%IN07
517 vpunpckhdq %%IN03, %%IN06, %%IN07
518
519 vpunpcklqdq %%K2, %%K0, %%T0
520 vpunpckhqdq %%T2, %%K0, %%T0
521 vpunpcklqdq %%K3, %%K1, %%T1
522 vpunpckhqdq %%T3, %%K1, %%T1
523
524 vpunpcklqdq %%K0, %%IN00, %%IN02
525 vpunpckhqdq %%K1, %%IN00, %%IN02
526 vpunpcklqdq %%T0, %%IN01, %%IN03
527 vpunpckhqdq %%T1, %%IN01, %%IN03
528
529 vpunpckldq %%K4, %%IN08, %%IN09
530 vpunpckhdq %%K5, %%IN08, %%IN09
531 vpunpckldq %%IN04, %%IN10, %%IN11
532 vpunpckhdq %%IN05, %%IN10, %%IN11
533 vpunpckldq %%IN06, %%IN12, %%IN13
534 vpunpckhdq %%IN07, %%IN12, %%IN13
535 vpunpckldq %%IN10, %%IN14, %%IN15
536 vpunpckhdq %%IN11, %%IN14, %%IN15
537
538 vpunpcklqdq %%IN12, %%K4, %%IN04
539 vpunpckhqdq %%IN13, %%K4, %%IN04
540 vpunpcklqdq %%IN14, %%K5, %%IN05
541 vpunpckhqdq %%IN15, %%K5, %%IN05
542 vpunpcklqdq %%IN00, %%IN06, %%IN10
543 vpunpckhqdq %%IN01, %%IN06, %%IN10
544 vpunpcklqdq %%IN02, %%IN07, %%IN11
545 vpunpckhqdq %%IN03, %%IN07, %%IN11
546
547 vshufi64x2 %%H0, %%K2, %%K0, 0x44
548 vshufi64x2 %%H1, %%K2, %%K0, 0xee
549 vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
550 vshufi64x2 %%H3, %%IN12, %%IN00, 0xee
551 vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
552 vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2
553 vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4
554 vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6
555
556 vshufi64x2 %%H0, %%T2, %%K1, 0x44
557 vshufi64x2 %%H1, %%T2, %%K1, 0xee
558 vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
559 vshufi64x2 %%H3, %%IN13, %%IN01, 0xee
560 vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
561 vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2
562 vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4
563 vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6
564
565 vshufi64x2 %%H0, %%K3, %%T0, 0x44
566 vshufi64x2 %%H1, %%K3, %%T0, 0xee
567 vshufi64x2 %%H2, %%IN14, %%IN02, 0x44
568 vshufi64x2 %%H3, %%IN14, %%IN02, 0xee
569 vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1
570 vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3
571 vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5
572 vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7
573
574 vshufi64x2 %%H0, %%T3, %%T1, 0x44
575 vshufi64x2 %%H1, %%T3, %%T1, 0xee
576 vshufi64x2 %%H2, %%IN15, %%IN03, 0x44
577 vshufi64x2 %%H3, %%IN15, %%IN03, 0xee
578 vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1
579 vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3
580 vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5
581 vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7
582 %endmacro
583
584 ;;; ===========================================================================
585 ;;; DATA TRANSPOSITION AT DATA OUTPUT
586 ;;; ===========================================================================
587 ;;;
588 ;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
589 ;;; in: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
590 ;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
591 ;;; T0-T3 [clobbered] - temporary zmm registers
592 ;;; K0-K5 [clobbered] - temporary zmm registers
593 ;;; H0-H3 [clobbered] - temporary zmm registers
594 %macro TRANSPOSE_OUT 30
595 %define %%IN00 %1 ; R0
596 %define %%IN01 %2 ; L0
597 %define %%IN02 %3 ; R1
598 %define %%IN03 %4 ; L1
599 %define %%IN04 %5 ; R2
600 %define %%IN05 %6 ; L2
601 %define %%IN06 %7 ; R3
602 %define %%IN07 %8 ; L3
603 %define %%IN08 %9 ; R4
604 %define %%IN09 %10 ; L4
605 %define %%IN10 %11 ; R5
606 %define %%IN11 %12 ; L5
607 %define %%IN12 %13 ; R6
608 %define %%IN13 %14 ; L6
609 %define %%IN14 %15 ; R7
610 %define %%IN15 %16 ; L7
611 %define %%T0 %17
612 %define %%T1 %18
613 %define %%T2 %19
614 %define %%T3 %20
615 %define %%K0 %21
616 %define %%K1 %22
617 %define %%K2 %23
618 %define %%K3 %24
619 %define %%K4 %25
620 %define %%K5 %26
621 %define %%H0 %27
622 %define %%H1 %28
623 %define %%H2 %29
624 %define %%H3 %30
625
626 vpunpckldq %%K0, %%IN01, %%IN00
627 vpunpckhdq %%K1, %%IN01, %%IN00
628 vpunpckldq %%T0, %%IN03, %%IN02
629 vpunpckhdq %%T1, %%IN03, %%IN02
630
631 vpunpckldq %%IN00, %%IN05, %%IN04
632 vpunpckhdq %%IN01, %%IN05, %%IN04
633 vpunpckldq %%IN02, %%IN07, %%IN06
634 vpunpckhdq %%IN03, %%IN07, %%IN06
635
636 vpunpcklqdq %%K2, %%K0, %%T0
637 vpunpckhqdq %%T2, %%K0, %%T0
638 vpunpcklqdq %%K3, %%K1, %%T1
639 vpunpckhqdq %%T3, %%K1, %%T1
640
641 vpunpcklqdq %%K0, %%IN00, %%IN02
642 vpunpckhqdq %%K1, %%IN00, %%IN02
643 vpunpcklqdq %%T0, %%IN01, %%IN03
644 vpunpckhqdq %%T1, %%IN01, %%IN03
645
646 vpunpckldq %%K4, %%IN09, %%IN08
647 vpunpckhdq %%K5, %%IN09, %%IN08
648 vpunpckldq %%IN04, %%IN11, %%IN10
649 vpunpckhdq %%IN05, %%IN11, %%IN10
650 vpunpckldq %%IN06, %%IN13, %%IN12
651 vpunpckhdq %%IN07, %%IN13, %%IN12
652 vpunpckldq %%IN10, %%IN15, %%IN14
653 vpunpckhdq %%IN11, %%IN15, %%IN14
654
655 vpunpcklqdq %%IN12, %%K4, %%IN04
656 vpunpckhqdq %%IN13, %%K4, %%IN04
657 vpunpcklqdq %%IN14, %%K5, %%IN05
658 vpunpckhqdq %%IN15, %%K5, %%IN05
659 vpunpcklqdq %%IN00, %%IN06, %%IN10
660 vpunpckhqdq %%IN01, %%IN06, %%IN10
661 vpunpcklqdq %%IN02, %%IN07, %%IN11
662 vpunpckhqdq %%IN03, %%IN07, %%IN11
663
664 vshufi64x2 %%H0, %%K2, %%K0, 0x44
665 vshufi64x2 %%H1, %%K2, %%K0, 0xee
666 vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
667 vshufi64x2 %%H3, %%IN12, %%IN00, 0xee
668 vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
669 vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2
670 vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4
671 vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6
672
673 vshufi64x2 %%H0, %%T2, %%K1, 0x44
674 vshufi64x2 %%H1, %%T2, %%K1, 0xee
675 vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
676 vshufi64x2 %%H3, %%IN13, %%IN01, 0xee
677 vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
678 vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2
679 vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4
680 vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6
681
682 vshufi64x2 %%H0, %%K3, %%T0, 0x44
683 vshufi64x2 %%H1, %%K3, %%T0, 0xee
684 vshufi64x2 %%H2, %%IN14, %%IN02, 0x44
685 vshufi64x2 %%H3, %%IN14, %%IN02, 0xee
686 vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1
687 vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3
688 vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5
689 vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7
690
691 vshufi64x2 %%H0, %%T3, %%T1, 0x44
692 vshufi64x2 %%H1, %%T3, %%T1, 0xee
693 vshufi64x2 %%H2, %%IN15, %%IN03, 0x44
694 vshufi64x2 %%H3, %%IN15, %%IN03, 0xee
695 vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1
696 vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3
697 vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5
698 vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7
699 %endmacro
700
701 ;;; ===========================================================================
702 ;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT
703 ;;; ===========================================================================
704 ;;;
705 ;;; IN00-IN15 / R0/L0-R7/L7 [in/out]:
706 ;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
707 ;;; out: R0 - 16 x word0, L0 - 16 x word1
708 ;;; T0,T2 [clobbered] - temporary zmm registers
709 ;;; K0-K4 [clobbered] - temporary zmm registers
710 ;;; H0,H2 [clobbered] - temporary zmm registers
711 %macro TRANSPOSE_IN_ONE 24
712 %define %%IN00 %1 ; R0
713 %define %%IN01 %2 ; L0
714 %define %%IN02 %3 ; R1
715 %define %%IN03 %4 ; L1
716 %define %%IN04 %5 ; R2
717 %define %%IN05 %6 ; L2
718 %define %%IN06 %7 ; R3
719 %define %%IN07 %8 ; L3
720 %define %%IN08 %9 ; R4
721 %define %%IN09 %10 ; L4
722 %define %%IN10 %11 ; R5
723 %define %%IN11 %12 ; L5
724 %define %%IN12 %13 ; R6
725 %define %%IN13 %14 ; L6
726 %define %%IN14 %15 ; R7
727 %define %%IN15 %16 ; L7
728 %define %%T0 %17
729 %define %%T2 %18
730 %define %%K0 %19
731 %define %%K1 %20
732 %define %%K2 %21
733 %define %%K4 %22
734 %define %%H0 %23
735 %define %%H2 %24
736
737 vpunpckldq %%K0, %%IN00, %%IN01
738 vpunpckhdq %%K1, %%IN00, %%IN01
739 vpunpckldq %%T0, %%IN02, %%IN03
740
741 vpunpckldq %%IN00, %%IN04, %%IN05
742 vpunpckhdq %%IN01, %%IN04, %%IN05
743 vpunpckldq %%IN02, %%IN06, %%IN07
744
745 vpunpcklqdq %%K2, %%K0, %%T0
746 vpunpckhqdq %%T2, %%K0, %%T0
747
748 vpunpcklqdq %%K0, %%IN00, %%IN02
749 vpunpckhqdq %%K1, %%IN00, %%IN02
750
751 vpunpckldq %%K4, %%IN08, %%IN09
752 vpunpckldq %%IN04, %%IN10, %%IN11
753 vpunpckldq %%IN06, %%IN12, %%IN13
754 vpunpckldq %%IN10, %%IN14, %%IN15
755
756 vpunpcklqdq %%IN12, %%K4, %%IN04
757 vpunpckhqdq %%IN13, %%K4, %%IN04
758 vpunpcklqdq %%IN00, %%IN06, %%IN10
759 vpunpckhqdq %%IN01, %%IN06, %%IN10
760
761 vshufi64x2 %%H0, %%K2, %%K0, 0x44
762 vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
763 vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
764
765 vshufi64x2 %%H0, %%T2, %%K1, 0x44
766 vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
767 vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
768 %endmacro
769
770 ;;; ===========================================================================
771 ;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT
772 ;;; ===========================================================================
773 ;;;
774 ;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
775 ;;; in: R0 - 16 x word0, L0 - 16 x word1
776 ;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
777 ;;; T0-T3 [clobbered] - temporary zmm registers
778 ;;; K0-K3 [clobbered] - temporary zmm registers
779 ;;; H0,H1 [clobbered] - temporary zmm registers
780 %macro TRANSPOSE_OUT_ONE 25
781 %define %%IN00 %1 ; R0
782 %define %%IN01 %2 ; L0
783 %define %%IN02 %3 ; R1
784 %define %%IN03 %4 ; L1
785 %define %%IN04 %5 ; R2
786 %define %%IN05 %6 ; L2
787 %define %%IN06 %7 ; R3
788 %define %%IN07 %8 ; L3
789 %define %%IN08 %9 ; R4
790 %define %%IN09 %10 ; L4
791 %define %%IN10 %11 ; R5
792 %define %%IN11 %12 ; L5
793 %define %%IN12 %13 ; R6
794 %define %%IN13 %14 ; L6
795 %define %%IN14 %15 ; R7
796 %define %%IN15 %16 ; L7
797 %define %%T0 %17
798 %define %%T2 %18
799 %define %%T3 %19
800 %define %%K0 %20
801 %define %%K1 %21
802 %define %%K2 %22
803 %define %%K3 %23
804 %define %%H0 %24
805 %define %%H1 %25
806
807 vpxord %%T0, %%T0, %%T0
808
809 vpunpckldq %%K0, %%IN01, %%IN00
810 vpunpckhdq %%K1, %%IN01, %%IN00
811
812 vpunpcklqdq %%K2, %%K0, %%T0
813 vpunpckhqdq %%T2, %%K0, %%T0
814 vpunpcklqdq %%K3, %%K1, %%T0
815 vpunpckhqdq %%T3, %%K1, %%T0
816
817 vshufi64x2 %%H0, %%K2, %%T0, 0x44
818 vshufi64x2 %%H1, %%K2, %%T0, 0xee
819 vshufi64x2 %%IN00, %%H0, %%T0, 0x88 ; R0
820 vshufi64x2 %%IN04, %%H0, %%T0, 0xdd ; R2
821 vshufi64x2 %%IN08, %%H1, %%T0, 0x88 ; R4
822 vshufi64x2 %%IN12, %%H1, %%T0, 0xdd ; R6
823
824 vshufi64x2 %%H0, %%T2, %%T0, 0x44
825 vshufi64x2 %%H1, %%T2, %%T0, 0xee
826 vshufi64x2 %%IN01, %%H0, %%T0, 0x88 ; L0
827 vshufi64x2 %%IN05, %%H0, %%T0, 0xdd ; L2
828 vshufi64x2 %%IN09, %%H1, %%T0, 0x88 ; L4
829 vshufi64x2 %%IN13, %%H1, %%T0, 0xdd ; L6
830
831 vshufi64x2 %%H0, %%K3, %%T0, 0x44
832 vshufi64x2 %%H1, %%K3, %%T0, 0xee
833 vshufi64x2 %%IN02, %%H0, %%T0, 0x88 ; R1
834 vshufi64x2 %%IN06, %%H0, %%T0, 0xdd ; R3
835 vshufi64x2 %%IN10, %%H1, %%T0, 0x88 ; R5
836 vshufi64x2 %%IN14, %%H1, %%T0, 0xdd ; R7
837
838 vshufi64x2 %%H0, %%T3, %%T0, 0x44
839 vshufi64x2 %%H1, %%T3, %%T0, 0xee
840 vshufi64x2 %%IN03, %%H0, %%T0, 0x88 ; L1
841 vshufi64x2 %%IN07, %%H0, %%T0, 0xdd ; L3
842 vshufi64x2 %%IN11, %%H1, %%T0, 0x88 ; L5
843 vshufi64x2 %%IN15, %%H1, %%T0, 0xdd ; L7
844 %endmacro
845
846 ;;; ===========================================================================
847 ;;; DES INITIALIZATION
848 ;;; key schedule transposition and IV set up
849 ;;; ===========================================================================
850 ;;;
851 ;;; STATE_KEYS [in] - KEYS in DES OOO STATE
852 ;;; STATE_IV [ in] - IV in DES OOO STATE
853 ;;; KS [out] - place to store transposed key schedule or NULL
854 ;;; IV0 [out] - r512; initialization vector
855 ;;; IV1 [out] - r512; initialization vector
856 ;;; T0-T27 [clobbered] - temporary r512
857 %macro DES_INIT 33
858 %define %%STATE_KEYS %1
859 %define %%STATE_IV %2
860 %define %%KS %3
861 %define %%IV0 %4
862 %define %%IV1 %5
863 %define %%T0 %6
864 %define %%T1 %7
865 %define %%T2 %8
866 %define %%T3 %9
867 %define %%T4 %10
868 %define %%T5 %11
869 %define %%T6 %12
870 %define %%T7 %13
871 %define %%T8 %14
872 %define %%T9 %15
873 %define %%T10 %16
874 %define %%T11 %17
875 %define %%T12 %18
876 %define %%T13 %19
877 %define %%T14 %20
878 %define %%T15 %21
879 %define %%T16 %22
880 %define %%T17 %23
881 %define %%T18 %24
882 %define %%T19 %25
883 %define %%T20 %26
884 %define %%T21 %27
885 %define %%T22 %28
886 %define %%T23 %29
887 %define %%T24 %30
888 %define %%T25 %31
889 %define %%T26 %32
890 %define %%T27 %33
891
892 ;; set up the key schedule
893 ;; - load first half of the keys & transpose
894 ;; - transpose and store
895 ;; note: we can use IV registers as temprary ones here
896 %assign IDX 0
897 %rep 16
898 mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
899 vmovdqu64 %%T %+ IDX, [IA0]
900 %assign IDX (IDX + 1)
901 %endrep
902 TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
903 %assign IDX 0
904 %rep 16
905 vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX
906 %assign IDX (IDX + 1)
907 %endrep
908 ;; - load second half of the keys & transpose
909 ;; - transpose and store
910 ;; note: we can use IV registers as temprary ones here
911 %assign IDX 0
912 %rep 16
913 mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
914 vmovdqu64 %%T %+ IDX, [IA0 + 64]
915 %assign IDX (IDX + 1)
916 %endrep
917 TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
918 %assign IDX 0
919 %rep 16
920 vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX
921 %assign IDX (IDX + 1)
922 %endrep
923
924 ;; set up IV
925 ;; - they are already kept transposed so this is enough to load them
926 vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)]
927 vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)]
928 %endmacro
929
;;; ===========================================================================
;;; 3DES INITIALIZATION
;;; Transposes the three key schedules of all 16 key pointers and loads the IV
;;; ===========================================================================
;;;
;;; STATE_KEYS [in]  - address expression of the 16 key pointers kept in the
;;;                    3DES OOO STATE; every pointer refers to a table of
;;;                    three key schedule pointers
;;; STATE_IV   [in]  - address expression of the IV kept in the 3DES OOO STATE
;;; KS1        [out] - address expression; receives transposed key schedule 1
;;; KS2        [out] - address expression; receives transposed key schedule 2
;;; KS3        [out] - address expression; receives transposed key schedule 3
;;; IV0        [out] - r512; initialization vector (first 64 bytes at STATE_IV)
;;; IV1        [out] - r512; initialization vector (next 64 bytes at STATE_IV)
;;; T0-T27     [clobbered] - temporary r512
;;; DIR        [in]  - ENC/DEC; ENC walks the key table as 0, 1, 2 and
;;;                    DEC walks it as 2, 1, 0 while filling KS1, KS2, KS3
;;;
;;; Clobbers IA0 and re-assigns the assembly-time symbols
;;; IDX, KEY_IDX, KS_IDX and KS_HALF.
%macro DES3_INIT 36
%define %%STATE_KEYS    %1
%define %%STATE_IV      %2
%define %%KS1           %3
%define %%KS2           %4
%define %%KS3           %5
%define %%IV0           %6
%define %%IV1           %7
%define %%T0            %8
%define %%T1            %9
%define %%T2            %10
%define %%T3            %11
%define %%T4            %12
%define %%T5            %13
%define %%T6            %14
%define %%T7            %15
%define %%T8            %16
%define %%T9            %17
%define %%T10           %18
%define %%T11           %19
%define %%T12           %20
%define %%T13           %21
%define %%T14           %22
%define %%T15           %23
%define %%T16           %24
%define %%T17           %25
%define %%T18           %26
%define %%T19           %27
%define %%T20           %28
%define %%T21           %29
%define %%T22           %30
%define %%T23           %31
%define %%T24           %32
%define %%T25           %33
%define %%T26           %34
%define %%T27           %35
%define %%DIR           %36

%assign KS_IDX 1
%rep 3
        ;; select which of the three keys feeds key schedule KS_IDX
%ifidn %%DIR, ENC
%assign KEY_IDX (KS_IDX - 1)
%else
%assign KEY_IDX (3 - KS_IDX)
%endif

        ;; A key schedule is read as two 64-byte halves per key pointer.
        ;; Each half is loaded for all 16 pointers, transposed and stored
        ;; as 16 consecutive 64-byte rows of the output schedule.
        ;; note: the IV registers serve as extra temporaries here, the IV
        ;;       itself is loaded only at the very end
%assign KS_HALF 0
%rep 2
%assign IDX 0
%rep 16
        mov             IA0, [%%STATE_KEYS + (IDX * PTR_SZ)]
        mov             IA0, [IA0 + (KEY_IDX * PTR_SZ)]
        vmovdqu64       %%T %+ IDX, [IA0 + (KS_HALF * 64)]
%assign IDX (IDX + 1)
%endrep
        TRANSPOSE_IN    %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
%assign IDX 0
%rep 16
        vmovdqu64       [%%KS %+ KS_IDX + (KS_HALF * 16 * 64) + (IDX * 64)], %%T %+ IDX
%assign IDX (IDX + 1)
%endrep
%assign KS_HALF (KS_HALF + 1)
%endrep                         ; KS_HALF

%assign KS_IDX (KS_IDX + 1)
%endrep                         ; KS_IDX

        ;; set up IV
        ;; - it is stored already transposed, so plain loads are sufficient
        vmovdqu64       %%IV0, [%%STATE_IV + (0 * 64)]
        vmovdqu64       %%IV1, [%%STATE_IV + (1 * 64)]

%endmacro
1039
;;; ===========================================================================
;;; DES FINISH
;;; Advances the in/out pointers by SIZE and writes the IV back to the state
;;; ===========================================================================
;;;
;;; Needs: STATE & SIZE
;;; IV0   [in] - r512; initialization vector
;;; IV1   [in] - r512; initialization vector
;;; T0-T4 [clobbered] - temporary r512 registers
%macro DES_FINISH 7
%define %%IV0   %1
%define %%IV1   %2
%define %%T0    %3
%define %%T1    %4
%define %%T2    %5
%define %%T3    %6
%define %%T4    %7

        ;; T4 = SIZE replicated into every 64-bit element
        vpbroadcastq    %%T4, SIZE

        ;; new pointer = old pointer + SIZE; all 16 "in" and all 16 "out"
        ;; pointers are read before any of them is written back
        vpaddq          %%T0, %%T4, [STATE + _des_args_in + (0 * PTR_SZ)]
        vpaddq          %%T1, %%T4, [STATE + _des_args_in + (8 * PTR_SZ)]
        vpaddq          %%T2, %%T4, [STATE + _des_args_out + (0 * PTR_SZ)]
        vpaddq          %%T3, %%T4, [STATE + _des_args_out + (8 * PTR_SZ)]
        vmovdqu64       [STATE + _des_args_in + (0 * PTR_SZ)], %%T0
        vmovdqu64       [STATE + _des_args_in + (8 * PTR_SZ)], %%T1
        vmovdqu64       [STATE + _des_args_out + (0 * PTR_SZ)], %%T2
        vmovdqu64       [STATE + _des_args_out + (8 * PTR_SZ)], %%T3

        ;; keep the running IV in the state for the next call
        vmovdqu64       [STATE + _des_args_IV + (0 * 64)], %%IV0
        vmovdqu64       [STATE + _des_args_IV + (1 * 64)], %%IV1
%endmacro
1075
1076 ;;; ===========================================================================
1077 ;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY
1078 ;;; ===========================================================================
1079 ;;;
;;; For every lane with a non-zero entry in _des_args_PLen this macro:
;;;  - loads PLen bytes of input (byte-masked, zero filled),
;;;  - runs the lane's IV through DES_ENC_DEC in the ENC direction with KS,
;;;  - xors the result with the input and stores PLen bytes of output,
;;;  - clears the lane's PLen entry so the lane is not serviced twice.
;;; Lanes are skipped when nothing qualifies; in the ENC build a lane
;;; qualifies only when its "out" pointer equals its LOut pointer.
;;;
1080 ;;; Needs: STATE, IA0-IA2
1081 ;;; ENC_DEC [in] - encrypt (ENC) or decrypt (DEC) selection
1082 ;;; KS [in] - key schedule
1083 ;;; T0-T24 [clobbered] - temporary r512
1084 ;;; T_IN [in] - 16 * 8 byte storage
1085 ;;; T_OUT [in] - 16 * 8 byte storage
1086 ;;; T_MASK [in] - 16 * 4 byte storage
1087 ;;; T_IV [in] - 16 * 8 byte storage
1088 ;;;
1089 ;;; NOTE: clobbers OpMask registers
1090 %macro DES_CFB_ONE 31
1091 %define %%ENC_DEC %1
1092 %define %%KS %2
1093 %define %%T0 %3
1094 %define %%T1 %4
1095 %define %%T2 %5
1096 %define %%T3 %6
1097 %define %%T4 %7
1098 %define %%T5 %8
1099 %define %%T6 %9
1100 %define %%T7 %10
1101 %define %%T8 %11
1102 %define %%T9 %12
1103 %define %%T10 %13
1104 %define %%T11 %14
1105 %define %%T12 %15
1106 %define %%T13 %16
1107 %define %%T14 %17
1108 %define %%T15 %18
1109 %define %%T16 %19
1110 %define %%T17 %20
1111 %define %%T18 %21
1112 %define %%T19 %22
1113 %define %%T20 %23
1114 %define %%T21 %24
1115 %define %%T22 %25
1116 %define %%T23 %26
1117 %define %%T24 %27
1118 %define %%T_IN %28
1119 %define %%T_OUT %29
1120 %define %%T_IV %30
1121 %define %%T_MASK %31
1122
1123 ;; - find mask for non-zero partial lengths
;; - T10 stays all-zero until the data loads below and is reused as the
;;   zero operand for compares and for clearing PLen
;; - k3 / IA0 get one bit per lane (16 dword lanes)
1124 vpxord %%T10, %%T10, %%T10
1125 vmovdqu64 %%T0, [STATE + _des_args_PLen]
1126 vpcmpd k3, %%T0, %%T10, 4 ; NEQ
1127 kmovw DWORD(IA0), k3
1128 movzx DWORD(IA0), WORD(IA0)
1129 or DWORD(IA0), DWORD(IA0)
1130 jz %%_des_cfb_one_end ; no non-zero partial lengths
1131
1132 %ifidn %%ENC_DEC, ENC
1133 ;; For encryption case we need to make sure that
1134 ;; all full blocks are complete before proceeding
1135 ;; with CFB partial block.
1136 ;; To do that current out position is compared against
1137 ;; calculated last full block position.
;; - k4 covers pointers 0-7, k5 covers pointers 8-15 (8 qwords each);
;;   both are merged into one 16-bit lane mask in IA2
1138 vmovdqu64 %%T1, [STATE + _des_args_out + (0*8)]
1139 vmovdqu64 %%T2, [STATE + _des_args_LOut + (0*8)]
1140 vmovdqu64 %%T3, [STATE + _des_args_out + (8*8)]
1141 vmovdqu64 %%T4, [STATE + _des_args_LOut + (8*8)]
1142 vpcmpq k4, %%T1, %%T2, 0 ; EQ
1143 vpcmpq k5, %%T3, %%T4, 0 ; EQ
1144 kmovw DWORD(IA1), k4
1145 movzx DWORD(IA1), BYTE(IA1)
1146 kmovw DWORD(IA2), k5
1147 movzx DWORD(IA2), BYTE(IA2)
1148 shl DWORD(IA2), 8
1149 or DWORD(IA2), DWORD(IA1)
1150 and DWORD(IA0), DWORD(IA2)
1151 jz %%_des_cfb_one_end ; no non-zero lengths left
1152 kmovw k3, DWORD(IA0)
1153 %endif
1154 ;; Calculate ((1 << partial_bytes) - 1)
1155 ;; in order to get the mask for loads and stores
1156 ;; k3 & IA0 - hold valid mask
;; - lanes outside k3 get a zero mask ({z}), so they load and store nothing
1157 vmovdqa64 %%T1, [rel vec_ones_32b]
1158 vpsllvd %%T2{k3}{z}, %%T1, %%T0
1159 vpsubd %%T2{k3}{z}, %%T2, %%T1
1160 vmovdqu64 [%%T_MASK], %%T2
1161
1162 ;; clear selected partial lens not to do them twice
1163 vmovdqu32 [STATE + _des_args_PLen]{k3}, %%T10
1164
1165 ;; copy IV, in and out pointers
;; - the temporary copies can be patched below without touching STATE
1166 vmovdqu64 %%T1, [STATE + _des_args_in + (0*PTR_SZ)]
1167 vmovdqu64 %%T2, [STATE + _des_args_in + (8*PTR_SZ)]
1168 vmovdqu64 %%T3, [STATE + _des_args_out + (0*PTR_SZ)]
1169 vmovdqu64 %%T4, [STATE + _des_args_out + (8*PTR_SZ)]
1170 vmovdqu64 %%T5, [STATE + _des_args_IV + (0*64)]
1171 vmovdqu64 %%T6, [STATE + _des_args_IV + (1*64)]
1172 vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1
1173 vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2
1174 vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3
1175 vmovdqu64 [%%T_OUT + (8*PTR_SZ)], %%T4
1176 vmovdqu64 [%%T_IV + (0*64)], %%T5
1177 vmovdqu64 [%%T_IV + (1*64)], %%T6
1178
1179 ;; calculate last block case mask
1180 ;; - first block case requires no modifications to in/out/IV
;; - IA1 = selected lanes (IA0) that also have a non-zero BLen entry
1181 vmovdqu64 %%T1, [STATE + _des_args_BLen]
1182 vpcmpd k2, %%T1, %%T10, 4 ; NEQ
1183 kmovw DWORD(IA1), k2
1184 and DWORD(IA1), DWORD(IA0)
1185 jz %%_des_cfb_one_no_last_blocks
1186
1187 ;; set up IV, in and out for the last block case
1188 ;; - Last block needs in and out to be set differently (decryption only)
1189 ;; - IA1 holds the last block mask
1190 %ifidn %%ENC_DEC, DEC
;; - IA0 is narrowed to the last block mask here (DEC build only)
;; - k4 / k5 select the low / high 8 pointers to be replaced with LIn / LOut
1191 mov DWORD(IA0), DWORD(IA1)
1192 mov DWORD(IA2), DWORD(IA1)
1193 shr DWORD(IA1), 8
1194 and DWORD(IA2), 0xff
1195 kmovw k4, DWORD(IA2)
1196 kmovw k5, DWORD(IA1)
1197 vmovdqu64 %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)]
1198 vmovdqu64 %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)]
1199 vmovdqu64 %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)]
1200 vmovdqu64 %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)]
1201 vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1
1202 vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2
1203 vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3
1204 vmovdqu64 [%%T_IN + (8*PTR_SZ)]{k5}, %%T4
1205 %endif ; decryption
1206 ;; - IV has to be set differently for CFB as well
1207 ;; - IA0 holds the last block mask
;; NOTE(review): in the ENC build IA0 is not reloaded from IA1, so the
;; loop below tests every selected lane, including lanes whose BLen is
;; zero - confirm that this is intended.
;; - for each tested lane the 8 bytes preceding LOut (ENC) or LIn (DEC)
;;   become the IV: low dword goes to T_IV row 0, high dword to T_IV row 1
1208 %assign IDX 0
1209 %rep 16
1210 test DWORD(IA0), (1 << IDX)
1211 jz %%_des_cfb_one_copy_iv_next %+ IDX
1212 %ifidn %%ENC_DEC, ENC
1213 mov IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)]
1214 %else
1215 mov IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)]
1216 %endif
1217 mov IA2, [IA2 - 8]
1218 mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2)
1219 shr IA2, 32
1220 mov [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2)
1221 %%_des_cfb_one_copy_iv_next %+ IDX:
1222 %assign IDX (IDX + 1)
1223 %endrep
1224
1225 %%_des_cfb_one_no_last_blocks:
1226 ;; Uffff ... finally let's do some DES CFB
1227 ;; - let's use T_IN, T_OUT, T_IV and T_MASK
1228
1229 ;; - load data with the corresponding masks & transpose
1230 ;; - T0 to T15 will hold the data
;; - IA0 is zeroed once; each 32-bit T_MASK load then leaves a
;;   zero-extended 64-bit byte mask for kmovq
1231 xor IA0, IA0
1232 %assign IDX 0
1233 %assign K_IDX 1
1234 %rep 16
1235 mov IA1, [%%T_IN + (IDX*PTR_SZ)]
1236 mov DWORD(IA0), [%%T_MASK + (IDX*4)]
1237 kmovq k %+ K_IDX, IA0
1238 vmovdqu8 %%T %+ IDX{k %+ K_IDX}{z}, [IA1]
1239 %assign IDX (IDX + 1)
1240 %assign K_IDX (K_IDX + 1)
1241 %if K_IDX > 7
1242 %assign K_IDX 1 ; iterate through K1 to K7
1243 %endif
1244 %endrep
1245 ;; - transpose the data in T0 to T15, T16 to T23 are clobbered
1246 TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23
1247
1248 ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1
1249 vmovdqu64 %%T16, [%%T_IV + (0 * 64)] ;IV0
1250 vmovdqu64 %%T17, [%%T_IV + (1 * 64)] ;IV1
1251 ;; DES encrypt
1252 ;; - R0 - %%T0
1253 ;; - L0 - %%T1
;; - the IV (T16/T17) is what gets enciphered, always in the ENC
;;   direction, regardless of %%ENC_DEC
1254 DES_ENC_DEC ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13
1255 ;; CFB style xor with R0/L0 with IV
1256 ;; - IV0 - %%T16
1257 ;; - IV1 - %%T17
1258 vpxord %%T2, %%T17, %%T0 ; R0 ^ IV1
1259 vpxord %%T0, %%T16, %%T1 ; L0 ^ IV0
1260 vmovdqa64 %%T1, %%T2
1261 ;; - new R0 = L0 ^ IV0 (%%T0)
1262 ;; - new L0 = R0 ^ IV1 (%%T1)
1263
1264 ;; Transpose the data out
1265 ;; - %%T2 to %%T24 clobbered
1266 TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24
1267
1268 ;; Store the transposed data
1269 ;; - T0 to T15 will hold the data
;; - same per-lane byte masks as for the loads, so only the partial
;;   bytes of each selected lane are written
1270 xor IA0, IA0
1271 %assign IDX 0
1272 %assign K_IDX 1
1273 %rep 16
1274 mov IA1, [%%T_OUT + (IDX*PTR_SZ)]
1275 mov DWORD(IA0), [%%T_MASK + (IDX*4)]
1276 kmovq k %+ K_IDX, IA0
1277 vmovdqu8 [IA1]{k %+ K_IDX}, %%T %+ IDX
1278 %assign IDX (IDX + 1)
1279 %assign K_IDX (K_IDX + 1)
1280 %if K_IDX > 7
1281 %assign K_IDX 1 ; iterate through K1 to K7
1282 %endif
1283 %endrep
1284
1285 %%_des_cfb_one_end:
1286
1287 %endmacro
1288
;;; ===========================================================================
;;; Converts length into mask of DES blocks
;;; ===========================================================================
;;;
;;; MASK [out] - r64; receives (1 << ((SIZE - OFFSET) / 8)) - 1, i.e. one bit
;;;              per whole 8-byte DES block that is left; meant to be used
;;;              as mask8 for masked 64b loads and stores
;;; USES: SIZE and OFFSET (read), MASK (written), IA1 and rcx:
;;;       - IA1 is not rcx: IA1 is clobbered, rcx is preserved through IA1
;;;       - IA1 is rcx:     rcx (= IA1) is clobbered
;;; ASSUMES: SIZE - OFFSET < 64
;;;          MASK is neither rcx nor IA1
%macro GET_MASK8 1
%define %%MASK  %1

        ;; A variable shift takes its count from CL, so the remaining length
        ;; has to be computed in rcx. %%CNT names the 64-bit register that is
        ;; rcx for this build. It is a macro-local alias, so nothing leaks
        ;; into the file-wide preprocessor namespace.
%ifidn IA1, rcx
%define %%CNT   IA1
%else
%define %%CNT   rcx
        mov             IA1, rcx                ; park caller's rcx in IA1
%endif
        mov             %%CNT, SIZE
        sub             %%CNT, OFFSET
        ;; - %%CNT - remaining length
        ;; - divide by 8 (DES block size)
        ;; - create bit mask of the result
        mov             DWORD(%%MASK), 1
        shr             DWORD(%%CNT), 3         ; bytes -> DES blocks
        shl             DWORD(%%MASK), BYTE(%%CNT)
        sub             DWORD(%%MASK), 1        ; (1 << blocks) - 1
%ifnidn IA1, rcx
        mov             rcx, IA1                ; hand rcx back to the caller
%endif
%endmacro
1318
;;; ===========================================================================
;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
;;; ===========================================================================
;;;
;;; Enciphers NUM_DES_BLOCKS transposed blocks held in the register pairs
;;; (ZW0, ZW1), (ZW2, ZW3), ... Block 0 is enciphered as passed in; every
;;; cipher result is xor-ed into the following block before that block is
;;; enciphered, and the last result is copied to ZIV0/ZIV1.
;;;
;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
;;; DES_KS         [in] - pointer to transposed key schedule
;;;
;;; NOTE: clobbers OpMask registers
;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
%macro GEN_DES_ENC_CIPHER 2
%define %%NUM_DES_BLOCKS        %1
%define %%DES_KS                %2

%assign RN      0               ; current block, R half
%assign LN      1               ; current block, L half
%assign RNN     2               ; next block, R half
%assign LNN     3               ; next block, L half
%rep %%NUM_DES_BLOCKS
        DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
%if (RNN < (%%NUM_DES_BLOCKS * 2))
        ;; not the last block: chain the result into the next block
        vpxord          ZW %+ RNN, ZW %+ RNN, ZW %+ LN  ; R(n+1) = R(n+1) ^ L(n)
        vpxord          ZW %+ LNN, ZW %+ LNN, ZW %+ RN  ; L(n+1) = L(n+1) ^ R(n)
%else
        ;; last block: its result is the IV for whatever comes next
        vmovdqa64       ZIV0, ZW %+ LN                  ; IV0 = L(last)
        vmovdqa64       ZIV1, ZW %+ RN                  ; IV1 = R(last)
%endif
%assign RN      (RN + 2)
%assign LN      (LN + 2)
%assign RNN     (RNN + 2)
%assign LNN     (LNN + 2)
%endrep
%endmacro
1349
;;; ===========================================================================
;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
;;; ===========================================================================
;;;
;;; Deciphers NUM_DES_BLOCKS transposed blocks held in the register pairs
;;; (ZW0, ZW1), (ZW2, ZW3), ... Each deciphered block is xor-ed with
;;; ZIV1/ZIV0, then ZIV0/ZIV1 are replaced with the block's original
;;; (still enciphered) contents for the next block.
;;;
;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
;;; DES_KS         [in] - pointer to transposed key schedule
;;;
;;; NOTE: clobbers OpMask registers
;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
%macro GEN_DES_DEC_CIPHER 2
%define %%NUM_DES_BLOCKS        %1
%define %%DES_KS                %2

%assign RN      0
%rep %%NUM_DES_BLOCKS
%assign LN      (RN + 1)
        ;; stash the cipher text, it is the IV of the next block
        vmovdqa64       ZTMP13, ZW %+ LN                ; L(n) as found on input
        vmovdqa64       ZTMP12, ZW %+ RN                ; R(n) as found on input
        DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        ;; undo the CBC chaining with the current IV
        vpxord          ZW %+ LN, ZW %+ LN, ZIV0        ; L(n) = L(n) ^ IV0
        vpxord          ZW %+ RN, ZW %+ RN, ZIV1        ; R(n) = R(n) ^ IV1
        ;; roll the IV forward
        vmovdqa64       ZIV1, ZTMP13
        vmovdqa64       ZIV0, ZTMP12
%assign RN      (RN + 2)
%endrep
%endmacro
1377
;;; ===========================================================================
;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
;;; ===========================================================================
;;;
;;; Same chaining as the single DES encrypt cipher, but every block goes
;;; through three passes: ENC with key schedule 1, DEC with key schedule 2
;;; (R/L operands exchanged) and ENC with key schedule 3.
;;;
;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
;;; DES_KS1        [in] - pointer to transposed key schedule 1
;;; DES_KS2        [in] - pointer to transposed key schedule 2
;;; DES_KS3        [in] - pointer to transposed key schedule 3
;;;
;;; NOTE: clobbers OpMask registers
;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
%macro GEN_3DES_ENC_CIPHER 4
%define %%NUM_DES_BLOCKS        %1
%define %%DES_KS1               %2
%define %%DES_KS2               %3
%define %%DES_KS3               %4

%assign RN      0
%rep %%NUM_DES_BLOCKS
%assign LN      (RN + 1)        ; L half of the current block
%assign RNN     (RN + 2)        ; R half of the next block
%assign LNN     (RN + 3)        ; L half of the next block
        ;; pass 1: ENC
        DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        ;; pass 2: DEC
        DES_ENC_DEC     DEC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        ;; pass 3: ENC
        DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
%if (RNN >= (%%NUM_DES_BLOCKS * 2))
        ;; last block: keep the result as the IV
        vmovdqa64       ZIV0, ZW %+ LN                  ; IV0 = L(last)
        vmovdqa64       ZIV1, ZW %+ RN                  ; IV1 = R(last)
%else
        ;; chain the result into the next block
        vpxord          ZW %+ RNN, ZW %+ RNN, ZW %+ LN  ; R(n+1) = R(n+1) ^ L(n)
        vpxord          ZW %+ LNN, ZW %+ LNN, ZW %+ RN  ; L(n+1) = L(n+1) ^ R(n)
%endif

%assign RN      (RN + 2)
%endrep

%endmacro
1421
;;; ===========================================================================
;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
;;; ===========================================================================
;;;
;;; Same IV handling as the single DES decrypt cipher, but every block goes
;;; through three passes: DEC with key schedule 1, ENC with key schedule 2
;;; (R/L operands exchanged) and DEC with key schedule 3.
;;;
;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
;;; DES_KS1        [in] - pointer to transposed key schedule 1
;;; DES_KS2        [in] - pointer to transposed key schedule 2
;;; DES_KS3        [in] - pointer to transposed key schedule 3
;;;
;;; NOTE: clobbers OpMask registers
;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
%macro GEN_3DES_DEC_CIPHER 4
%define %%NUM_DES_BLOCKS        %1
%define %%DES_KS1               %2
%define %%DES_KS2               %3
%define %%DES_KS3               %4

%assign RN      0
%rep %%NUM_DES_BLOCKS
%assign LN      (RN + 1)
        ;; stash the cipher text, it is the IV of the next block
        vmovdqa64       ZTMP13, ZW %+ LN                ; L(n) as found on input
        vmovdqa64       ZTMP12, ZW %+ RN                ; R(n) as found on input
        ;; pass 1: DEC
        DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        ;; pass 2: ENC
        DES_ENC_DEC     ENC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        ;; pass 3: DEC
        DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        ;; undo the CBC chaining with the current IV
        vpxord          ZW %+ LN, ZW %+ LN, ZIV0        ; L(n) = L(n) ^ IV0
        vpxord          ZW %+ RN, ZW %+ RN, ZIV1        ; R(n) = R(n) ^ IV1
        ;; roll the IV forward
        vmovdqa64       ZIV1, ZTMP13
        vmovdqa64       ZIV0, ZTMP12

%assign RN      (RN + 2)
%endrep

%endmacro
1460
1461 ;;; ===========================================================================
1462 ;;; DES CBC / DOCSIS DES ENCRYPT
1463 ;;; ===========================================================================
1464 ;;;
;;; Complete encrypt body for 16 buffers handled in parallel:
;;;  1. builds an aligned stack frame and saves r12-r15
;;;  2. transposes the key schedule(s) onto the stack and loads the IV
;;;  3. CBC-encrypts 64 bytes (8 DES blocks) per buffer per loop iteration
;;;  4. CBC-encrypts the remaining 1 to 7 DES blocks with masked loads/stores
;;;  5. stores the IV and advances the in/out pointers (DES_FINISH)
;;;  6. DOCSIS only: processes the partial block in CFB mode (DES_CFB_ONE)
;;;  7. restores r12-r15 and the stack pointer
;;;
;;; Needs: STATE, SIZE, OFFSET, IA0-IA2, INP0-INP4
;;; Clobbers: rax (holds the original stack pointer while the frame is
;;;           set up), ZW0-ZW15, ZTMP0-ZTMP13, ZIV0, ZIV1
;;;
1465 ;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
1466 ;;; 3DES (3DES CBC)
1467 ;;;
1468 ;;; NOTE: clobbers OpMask registers
1469 %macro GENERIC_DES_ENC 1
1470 %define %%DES_DOCSIS %1
1471
1472 ;; push the registers and allocate the stack frame
;; - the frame start is rounded down to a 64-byte boundary, the original
;;   stack pointer is kept in the frame to undo this at the end
1473 mov rax, rsp
1474 sub rsp, STACKFRAME_size
1475 and rsp, -64
1476 mov [rsp + _rsp_save], rax ; original SP
1477 mov [rsp + _gpr_save + 0*8], r12
1478 mov [rsp + _gpr_save + 1*8], r13
1479 mov [rsp + _gpr_save + 2*8], r14
1480 mov [rsp + _gpr_save + 3*8], r15
1481
1482 %ifnidn %%DES_DOCSIS, 3DES
1483 ;; DES and DOCSIS DES
1484 DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
1485 %else
1486 ;; 3DES
1487 DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC
1488 %endif
;; keep the full length in the frame; inside the loop SIZE is the length
;; rounded down to a multiple of 64 bytes
1489 mov [rsp + _size_save], SIZE
1490 and SIZE, -64
1491 xor OFFSET, OFFSET
1492 ;; This loop processes message in blocks of 64 bytes.
1493 ;; Anything smaller than 64 bytes is handled separately after the loop.
1494 %%_gen_des_enc_loop:
1495 cmp OFFSET, SIZE
1496 jz %%_gen_des_enc_loop_end
1497 ;; run loads
;; - 16 input pointers are handled as two groups of 8, reusing the same
;;   8 general purpose registers for both groups
1498 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1499 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1500 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1501 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1502 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1503 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1504 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1505 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1506 vmovdqu64 ZW0, [IA0 + OFFSET]
1507 vmovdqu64 ZW1, [IA1 + OFFSET]
1508 vmovdqu64 ZW2, [IA2 + OFFSET]
1509 vmovdqu64 ZW3, [INP0 + OFFSET]
1510 vmovdqu64 ZW4, [INP1 + OFFSET]
1511 vmovdqu64 ZW5, [INP2 + OFFSET]
1512 vmovdqu64 ZW6, [INP3 + OFFSET]
1513 vmovdqu64 ZW7, [INP4 + OFFSET]
1514
1515 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1516 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1517 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1518 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1519 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1520 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1521 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1522 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1523 vmovdqu64 ZW8, [IA0 + OFFSET]
1524 vmovdqu64 ZW9, [IA1 + OFFSET]
1525 vmovdqu64 ZW10, [IA2 + OFFSET]
1526 vmovdqu64 ZW11, [INP0 + OFFSET]
1527 vmovdqu64 ZW12, [INP1 + OFFSET]
1528 vmovdqu64 ZW13, [INP2 + OFFSET]
1529 vmovdqu64 ZW14, [INP3 + OFFSET]
1530 vmovdqu64 ZW15, [INP4 + OFFSET]
1531
1532 ;; Transpose input
1533 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1534
1535 ;; DES CBC ENC comes here
;; - only the first block is xor-ed with the IV here, the cipher macro
;;   chains the following blocks and leaves the new IV in ZIV0/ZIV1
1536 vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
1537 vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
1538
1539 %ifnidn %%DES_DOCSIS, 3DES
1540 GEN_DES_ENC_CIPHER 8, rsp + _key_sched
1541 %else
1542 GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1543 %endif
1544
1545 ;; transpose data on output
1546 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1547 ;; run stores
1548 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
1549 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
1550 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
1551 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
1552 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
1553 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
1554 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
1555 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
1556 vmovdqu64 [IA0 + OFFSET], ZW0
1557 vmovdqu64 [IA1 + OFFSET], ZW1
1558 vmovdqu64 [IA2 + OFFSET], ZW2
1559 vmovdqu64 [INP0 + OFFSET], ZW3
1560 vmovdqu64 [INP1 + OFFSET], ZW4
1561 vmovdqu64 [INP2 + OFFSET], ZW5
1562 vmovdqu64 [INP3 + OFFSET], ZW6
1563 vmovdqu64 [INP4 + OFFSET], ZW7
1564
1565 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
1566 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
1567 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
1568 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
1569 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
1570 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
1571 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
1572 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
1573 vmovdqu64 [IA0 + OFFSET], ZW8
1574 vmovdqu64 [IA1 + OFFSET], ZW9
1575 vmovdqu64 [IA2 + OFFSET], ZW10
1576 vmovdqu64 [INP0 + OFFSET], ZW11
1577 vmovdqu64 [INP1 + OFFSET], ZW12
1578 vmovdqu64 [INP2 + OFFSET], ZW13
1579 vmovdqu64 [INP3 + OFFSET], ZW14
1580 vmovdqu64 [INP4 + OFFSET], ZW15
1581
1582 add OFFSET, 64
1583 jmp %%_gen_des_enc_loop
1584 %%_gen_des_enc_loop_end:
1585 ;; This is where we check if there is anything less than 64 bytes
1586 ;; of message left for processing.
1587 mov SIZE, [rsp + _size_save]
1588 cmp OFFSET, SIZE
1589 jz %%_gen_des_enc_part_end
1590 ;; convert bytes left (less than 64) to a mask with one bit per 8-byte DES block
1591 GET_MASK8 IA0 ; IA0 = mask
1592
;; - k7 is used for the masked loads here and again for the masked stores
;;   after the cipher; the mask value is also kept in the frame because
;;   IA0 gets reused as a pointer register
;; NOTE(review): the cipher macros are documented as clobbering OpMask
;; registers - confirm that k7 is left intact by DES_ENC_DEC.
1593 kmovw k7, DWORD(IA0)
1594 mov [rsp + _mask_save], IA0
1595 ;; run masked loads
1596 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1597 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1598 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1599 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1600 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1601 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1602 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1603 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1604 vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
1605 vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
1606 vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
1607 vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
1608 vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
1609 vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
1610 vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
1611 vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
1612
1613 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1614 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1615 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1616 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1617 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1618 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1619 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1620 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1621 vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
1622 vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
1623 vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
1624 vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
1625 vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
1626 vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
1627 vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
1628 vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
1629
1630 ;; Transpose input
1631 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1632
1633 ;; DES CBC ENC comes here
1634 vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
1635 vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
1636
;; Dispatch on the number of blocks left, encoded in the mask as
;; (1 << blocks) - 1: 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f or 0x7f.
;; Each compare splits the range three ways: above / equal / below
;; (unsigned).
1637 mov IA0, [rsp + _mask_save]
1638 cmp BYTE(IA0), 0x0f
1639 ja %%_gt_4
1640 jz %%_blocks_4
1641
1642 cmp BYTE(IA0), 0x03
1643 ja %%_blocks_3
1644 jz %%_blocks_2
1645
1646 ;; process one block and move to transpose out
1647 %ifnidn %%DES_DOCSIS, 3DES
1648 GEN_DES_ENC_CIPHER 1, rsp + _key_sched
1649 %else
1650 GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1651 %endif
1652 jmp %%_transpose_out
1653
1654 %%_blocks_2:
1655 ;; process two blocks and move to transpose out
1656 %ifnidn %%DES_DOCSIS, 3DES
1657 GEN_DES_ENC_CIPHER 2, rsp + _key_sched
1658 %else
1659 GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1660 %endif
1661 jmp %%_transpose_out
1662
1663 %%_blocks_3:
1664 ;; process three blocks and move to transpose out
1665 %ifnidn %%DES_DOCSIS, 3DES
1666 GEN_DES_ENC_CIPHER 3, rsp + _key_sched
1667 %else
1668 GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1669 %endif
1670 jmp %%_transpose_out
1671
1672 %%_blocks_4:
1673 ;; process four blocks and move to transpose out
1674 %ifnidn %%DES_DOCSIS, 3DES
1675 GEN_DES_ENC_CIPHER 4, rsp + _key_sched
1676 %else
1677 GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1678 %endif
1679 jmp %%_transpose_out
1680
1681 %%_gt_4:
;; mask is 0x1f, 0x3f or 0x7f here (5, 6 or 7 blocks)
1682 cmp BYTE(IA0), 0x3f
1683 ja %%_blocks_7
1684 jz %%_blocks_6
1685 %%_blocks_5:
1686 ;; process five blocks and move to transpose out
1687 %ifnidn %%DES_DOCSIS, 3DES
1688 GEN_DES_ENC_CIPHER 5, rsp + _key_sched
1689 %else
1690 GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1691 %endif
1692 jmp %%_transpose_out
1693
1694 %%_blocks_6:
1695 ;; process six blocks and move to transpose out
1696 %ifnidn %%DES_DOCSIS, 3DES
1697 GEN_DES_ENC_CIPHER 6, rsp + _key_sched
1698 %else
1699 GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1700 %endif
1701 jmp %%_transpose_out
1702
1703 %%_blocks_7:
1704 ;; process seven blocks and move to transpose out
;; - last case, falls through to the transpose below
1705 %ifnidn %%DES_DOCSIS, 3DES
1706 GEN_DES_ENC_CIPHER 7, rsp + _key_sched
1707 %else
1708 GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1709 %endif
1710
1711 %%_transpose_out:
1712 ;; transpose data on output
1713 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1714
1715 ;; run masked stores
;; - k7 limits every store to the 8-byte blocks that were actually loaded
1716 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
1717 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
1718 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
1719 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
1720 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
1721 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
1722 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
1723 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
1724 vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
1725 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
1726 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
1727 vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
1728 vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
1729 vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
1730 vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
1731 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
1732
1733 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
1734 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
1735 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
1736 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
1737 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
1738 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
1739 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
1740 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
1741 vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
1742 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
1743 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
1744 vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
1745 vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
1746 vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
1747 vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
1748 vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
1749 %%_gen_des_enc_part_end:
1750
1751 ;; store IV and update pointers
;; - SIZE holds the full (saved) length again at this point, so the
;;   pointers advance by everything that was processed
1752 DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
1753
1754 ;; CFB part for DOCSIS
;; - runs after DES_FINISH, so it sees the updated out pointers and IV
1755 %ifidn %%DES_DOCSIS, DOCSIS
1756 DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
1757 %endif
1758 ;; restore stack pointer and registers
1759 mov r12, [rsp + _gpr_save + 0*8]
1760 mov r13, [rsp + _gpr_save + 1*8]
1761 mov r14, [rsp + _gpr_save + 2*8]
1762 mov r15, [rsp + _gpr_save + 3*8]
1763 mov rsp, [rsp + _rsp_save] ; original SP
1764 %endmacro
1765
1766 ;;; ===========================================================================
1767 ;;; DES CBC / 3DES CBC / DOCSIS DES DECRYPT
1768 ;;; ===========================================================================
1769 ;;;
1770 ;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
1771 ;;; 3DES (3DES CBC)
1772 ;;;
1773 ;;; NOTE: clobbers OpMask registers
;;;
;;; Decrypts 16 independent buffers in parallel. Input and output pointers are
;;; read from the _des_args_in / _des_args_out pointer arrays (16 entries each)
;;; found at STATE; SIZE holds the byte count to process.
;;;
;;; Flow, as visible in the body:
;;;   1. Build a 64-byte aligned stack frame of STACKFRAME_size bytes and save
;;;      the original rsp plus r12-r15 in it.
;;;   2. Run DES_INIT (DES, DOCSIS) or DES3_INIT (3DES) with the key and IV
;;;      arrays at STATE and the key schedule area(s) in the stack frame.
;;;   3. DOCSIS only: run DES_CFB_ONE in DEC mode.
;;;   4. Main loop: 64 bytes per buffer per iteration
;;;      (load, TRANSPOSE_IN, 8-block cipher, TRANSPOSE_OUT, store).
;;;   5. Tail: if the byte count is not a multiple of 64, process the remainder
;;;      with qword-masked loads/stores (k7) and a 1..7 block cipher pass.
;;;   6. DES_FINISH, then restore r12-r15 and the original rsp.
;;;
;;; Directly written here: rax, rsp (restored on exit), IA0-IA2, INP0-INP4,
;;; SIZE, OFFSET, k7 and ZW0-ZW15. The helper macros receive ZIV0-ZIV1 and
;;; ZTMP0-ZTMP13 as working registers and may clobber more (see NOTE above).
1774 %macro GENERIC_DES_DEC 1
1775 %define %%DES_DOCSIS %1
1776
1777 ;; push the registers and allocate the stack frame
1778 mov rax, rsp ; keep the caller's rsp until the new frame can hold it
1779 sub rsp, STACKFRAME_size
1780 and rsp, -64 ; round the frame base down to a 64-byte boundary
1781 mov [rsp + _rsp_save], rax ; original SP
1782 mov [rsp + _gpr_save + 0*8], r12
1783 mov [rsp + _gpr_save + 1*8], r13
1784 mov [rsp + _gpr_save + 2*8], r14
1785 mov [rsp + _gpr_save + 3*8], r15
1786
1787 %ifnidn %%DES_DOCSIS, 3DES
1788 ;; DES and DOCSIS
;; single key schedule, placed in the stack frame at _key_sched
1789 DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
1790 %else
1791 ;; 3DES
;; three key schedule areas (_key_sched, _key_sched2, _key_sched3); DEC selects the direction
1792 DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC
1793 %endif
1794
1795 ;; CFB part for DOCSIS
;; NOTE(review): presumably this handles the DOCSIS partial (non block
;; multiple) trailing bytes; DES_CFB_ONE is defined outside this chunk - confirm.
1796 %ifidn %%DES_DOCSIS, DOCSIS
1797 DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
1798 %endif
1799
1800 mov [rsp + _size_save], SIZE ; keep the full byte count for the tail pass
1801 and SIZE, -64 ; SIZE = byte count rounded down to a multiple of 64
1802 xor OFFSET, OFFSET ; OFFSET = 0, byte offset applied to every buffer
1803 ;; This loop processes message in blocks of 64 bytes.
1804 ;; Anything smaller than 64 bytes is handled separately after the loop.
1805 %%_gen_des_dec_loop:
1806 cmp OFFSET, SIZE
1807 jz %%_gen_des_dec_loop_end
1808 ;; run loads
;; buffers 0-7: fetch the input pointers, then 64 bytes from each at OFFSET
1809 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1810 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1811 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1812 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1813 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1814 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1815 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1816 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1817 vmovdqu64 ZW0, [IA0 + OFFSET]
1818 vmovdqu64 ZW1, [IA1 + OFFSET]
1819 vmovdqu64 ZW2, [IA2 + OFFSET]
1820 vmovdqu64 ZW3, [INP0 + OFFSET]
1821 vmovdqu64 ZW4, [INP1 + OFFSET]
1822 vmovdqu64 ZW5, [INP2 + OFFSET]
1823 vmovdqu64 ZW6, [INP3 + OFFSET]
1824 vmovdqu64 ZW7, [INP4 + OFFSET]
1825
;; buffers 8-15: the same eight pointer registers are reused
1826 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1827 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1828 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1829 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1830 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1831 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1832 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1833 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1834 vmovdqu64 ZW8, [IA0 + OFFSET]
1835 vmovdqu64 ZW9, [IA1 + OFFSET]
1836 vmovdqu64 ZW10, [IA2 + OFFSET]
1837 vmovdqu64 ZW11, [INP0 + OFFSET]
1838 vmovdqu64 ZW12, [INP1 + OFFSET]
1839 vmovdqu64 ZW13, [INP2 + OFFSET]
1840 vmovdqu64 ZW14, [INP3 + OFFSET]
1841 vmovdqu64 ZW15, [INP4 + OFFSET]
1842
1843 ;; Transpose input
1844 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1845
;; 64 bytes per buffer = 8 cipher blocks of 8 bytes, hence the block count of 8
1846 %ifnidn %%DES_DOCSIS, 3DES
1847 ;; DES CBC DEC comes here
1848 GEN_DES_DEC_CIPHER 8, rsp + _key_sched
1849 %else
1850 ;; 3DES CBC DEC comes here
1851 GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1852 %endif
1853
1854 ;; transpose data on output
1855 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1856
1857 ;; run stores
;; buffers 0-7: fetch the output pointers, then store 64 bytes to each at OFFSET
1858 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
1859 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
1860 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
1861 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
1862 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
1863 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
1864 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
1865 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
1866 vmovdqu64 [IA0 + OFFSET], ZW0
1867 vmovdqu64 [IA1 + OFFSET], ZW1
1868 vmovdqu64 [IA2 + OFFSET], ZW2
1869 vmovdqu64 [INP0 + OFFSET], ZW3
1870 vmovdqu64 [INP1 + OFFSET], ZW4
1871 vmovdqu64 [INP2 + OFFSET], ZW5
1872 vmovdqu64 [INP3 + OFFSET], ZW6
1873 vmovdqu64 [INP4 + OFFSET], ZW7
1874
;; buffers 8-15
1875 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
1876 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
1877 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
1878 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
1879 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
1880 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
1881 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
1882 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
1883 vmovdqu64 [IA0 + OFFSET], ZW8
1884 vmovdqu64 [IA1 + OFFSET], ZW9
1885 vmovdqu64 [IA2 + OFFSET], ZW10
1886 vmovdqu64 [INP0 + OFFSET], ZW11
1887 vmovdqu64 [INP1 + OFFSET], ZW12
1888 vmovdqu64 [INP2 + OFFSET], ZW13
1889 vmovdqu64 [INP3 + OFFSET], ZW14
1890 vmovdqu64 [INP4 + OFFSET], ZW15
1891
1892 add OFFSET, 64
1893 jmp %%_gen_des_dec_loop
1894 %%_gen_des_dec_loop_end:
1895 ;; This is where we check if there is anything less than 64 bytes
1896 ;; of message left for processing.
1897 mov SIZE, [rsp + _size_save] ; reload the full (unrounded) byte count
1898 cmp OFFSET, SIZE
1899 jz %%_gen_des_dec_part_end ; byte count was a multiple of 64: no tail
1900 ;; calculate min of bytes_left and 64, convert to qword mask
1901 GET_MASK8 IA0 ; IA0 = mask
1902
1903 kmovw k7, DWORD(IA0) ; k7 = qword mask used by the tail loads and stores
1904 mov [rsp + _mask_save], IA0 ; IA0 is reused as a pointer register below
1905 ;; run masked loads
;; {z}: qwords not selected by k7 are zeroed in the destination register
1906 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1907 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1908 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1909 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1910 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1911 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1912 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1913 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1914 vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
1915 vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
1916 vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
1917 vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
1918 vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
1919 vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
1920 vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
1921 vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
1922
1923 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1924 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1925 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1926 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1927 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1928 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1929 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1930 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1931 vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
1932 vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
1933 vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
1934 vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
1935 vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
1936 vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
1937 vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
1938 vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
1939
1940 ;; Transpose input
1941 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1942
1943 ;; DES CBC DEC comes here
;; Select the cipher pass by the number of blocks left (1..7), decoded from
;; the low byte of the saved mask with a small compare tree.
;; NOTE(review): assumes GET_MASK8 returns a contiguous low-bit mask, i.e.
;; 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f or 0x7f for 1..7 qwords - confirm
;; against GET_MASK8, which is defined outside this chunk.
1944 mov IA0, [rsp + _mask_save]
1945 cmp BYTE(IA0), 0x0f
1946 ja %%_gt_4 ; more than four mask bits set: 5, 6 or 7 blocks
1947 jz %%_blocks_4 ; exactly 0x0f: 4 blocks
1948
1949 cmp BYTE(IA0), 0x03
1950 ja %%_blocks_3 ; between 0x03 and 0x0f: 3 blocks
1951 jz %%_blocks_2 ; exactly 0x03: 2 blocks
1952 ;; process one block and move to transpose out
1953 %ifnidn %%DES_DOCSIS, 3DES
1954 GEN_DES_DEC_CIPHER 1, rsp + _key_sched
1955 %else
1956 GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1957 %endif
1958 jmp %%_transpose_out
1959
1960 %%_blocks_2:
1961 ;; process two blocks and move to transpose out
1962 %ifnidn %%DES_DOCSIS, 3DES
1963 GEN_DES_DEC_CIPHER 2, rsp + _key_sched
1964 %else
1965 GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1966 %endif
1967 jmp %%_transpose_out
1968
1969 %%_blocks_3:
1970 ;; process three blocks and move to transpose out
1971 %ifnidn %%DES_DOCSIS, 3DES
1972 GEN_DES_DEC_CIPHER 3, rsp + _key_sched
1973 %else
1974 GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1975 %endif
1976 jmp %%_transpose_out
1977
1978 %%_blocks_4:
1979 ;; process four blocks and move to transpose out
1980 %ifnidn %%DES_DOCSIS, 3DES
1981 GEN_DES_DEC_CIPHER 4, rsp + _key_sched
1982 %else
1983 GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1984 %endif
1985 jmp %%_transpose_out
1986
1987 %%_gt_4:
1988 cmp BYTE(IA0), 0x3f
1989 ja %%_blocks_7 ; above 0x3f: 7 blocks
1990 jz %%_blocks_6 ; exactly 0x3f: 6 blocks
;; otherwise fall through: 5 blocks
1991 %%_blocks_5:
1992 ;; process five blocks and move to transpose out
1993 %ifnidn %%DES_DOCSIS, 3DES
1994 GEN_DES_DEC_CIPHER 5, rsp + _key_sched
1995 %else
1996 GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1997 %endif
1998 jmp %%_transpose_out
1999
2000 %%_blocks_6:
2001 ;; process six blocks and move to transpose out
2002 %ifnidn %%DES_DOCSIS, 3DES
2003 GEN_DES_DEC_CIPHER 6, rsp + _key_sched
2004 %else
2005 GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2006 %endif
2007 jmp %%_transpose_out
2008
2009 %%_blocks_7:
2010 ;; process seven blocks and move to transpose out
2011 %ifnidn %%DES_DOCSIS, 3DES
2012 GEN_DES_DEC_CIPHER 7, rsp + _key_sched
2013 %else
2014 GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2015 %endif
;; no jump needed: the seven-block path falls straight into the transpose
2016
2017 %%_transpose_out:
2018 ;; transpose data on output
2019 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
2020
2021 ;; run masked stores
;; only the qwords selected by k7 are written to the output buffers
2022 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
2023 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
2024 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
2025 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
2026 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
2027 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
2028 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
2029 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
2030 vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
2031 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
2032 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
2033 vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
2034 vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
2035 vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
2036 vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
2037 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
2038
2039 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
2040 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
2041 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
2042 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
2043 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
2044 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
2045 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
2046 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
2047 vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
2048 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
2049 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
2050 vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
2051 vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
2052 vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
2053 vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
2054 vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
2055 %%_gen_des_dec_part_end:
2056
2057 ;; store IV and update pointers
2058 DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
2059
2060 ;; restore stack pointer and registers
;; rsp is reloaded last because the saved values are addressed through it
2061 mov r12, [rsp + _gpr_save + 0*8]
2062 mov r13, [rsp + _gpr_save + 1*8]
2063 mov r14, [rsp + _gpr_save + 2*8]
2064 mov r15, [rsp + _gpr_save + 3*8]
2065 mov rsp, [rsp + _rsp_save] ; original SP
2066 %endmacro
2067
2068
2069 ;;; ========================================================
2070 ;;; DATA
2071
section .data
default rel

;;; mask_values
;;; 19 consecutive rows of 64 bytes. Each row is a single 32-bit constant
;;; replicated 16 times, i.e. sized to fill one 64-byte (ZMM-width) vector.
;;; Row N starts at mask_values + N*64.
;;; NOTE(review): presumably bit-select masks for a DES permutation step;
;;; confirm against the macros that reference this label.
align 64
mask_values:
        times 16 dd 0x04000000          ; row  0
        times 16 dd 0x40240202          ; row  1
        times 16 dd 0x00001110          ; row  2
        times 16 dd 0x01088000          ; row  3
        times 16 dd 0x00000001          ; row  4
        times 16 dd 0x0081000C          ; row  5
        times 16 dd 0x00000020          ; row  6
        times 16 dd 0x00000040          ; row  7
        times 16 dd 0x00400400          ; row  8
        times 16 dd 0x00000800          ; row  9
        times 16 dd 0x00002000          ; row 10
        times 16 dd 0x00100000          ; row 11
        times 16 dd 0x00004000          ; row 12
        times 16 dd 0x00020000          ; row 13
        times 16 dd 0x02000000          ; row 14
        times 16 dd 0x08000000          ; row 15
        times 16 dd 0x00000080          ; row 16
        times 16 dd 0x20000000          ; row 17
        times 16 dd 0x90000000          ; row 18
2152
align 64
;;; init_perm_consts
;;; Five rows of 64 bytes; each row is one 32-bit mask replicated 16 times.
;;; Row N starts at init_perm_consts + N*64.
;;; NOTE(review): the name suggests these are the bit-swap masks of the DES
;;; initial permutation; confirm against the macros that use them.
init_perm_consts:
        times 16 dd 0x0f0f0f0f          ; row 0: low nibble of every byte
        times 16 dd 0x0000ffff          ; row 1: low 16 bits of every dword
        times 16 dd 0x33333333          ; row 2: low 2 bits of every nibble
        times 16 dd 0x00ff00ff          ; row 3: low byte of every 16-bit word
        times 16 dd 0x55555555          ; row 4: every even-numbered bit
2175
2176 ;;; S-Box table
;;; Eight tables (SBOX0..SBOX7) laid out back to back. Each table holds 64
;;; entries stored as 16-bit words (128 bytes per table, 1024 bytes in all);
;;; every entry is a 4-bit value in the range 0x00..0x0f.
;;; NOTE(review): "flipped" presumably refers to an entry ordering that
;;; differs from the textbook DES S-box layout; confirm against the
;;; lookup code that indexes this table.
2177 align 64
2178 S_box_flipped:
2179 ;; SBOX0
2180 dw 0x07, 0x02, 0x0c, 0x0f, 0x04, 0x0b, 0x0a, 0x0c
2181 dw 0x0b, 0x07, 0x06, 0x09, 0x0d, 0x04, 0x00, 0x0a
2182 dw 0x02, 0x08, 0x05, 0x03, 0x0f, 0x06, 0x09, 0x05
2183 dw 0x08, 0x01, 0x03, 0x0e, 0x01, 0x0d, 0x0e, 0x00
2184 dw 0x00, 0x0f, 0x05, 0x0a, 0x07, 0x02, 0x09, 0x05
2185 dw 0x0e, 0x01, 0x03, 0x0c, 0x0b, 0x08, 0x0c, 0x06
2186 dw 0x0f, 0x03, 0x06, 0x0d, 0x04, 0x09, 0x0a, 0x00
2187 dw 0x02, 0x04, 0x0d, 0x07, 0x08, 0x0e, 0x01, 0x0b
2188 ;; SBOX1
2189 dw 0x0f, 0x00, 0x09, 0x0a, 0x06, 0x05, 0x03, 0x09
2190 dw 0x01, 0x0e, 0x04, 0x03, 0x0c, 0x0b, 0x0a, 0x04
2191 dw 0x08, 0x07, 0x0e, 0x01, 0x0d, 0x02, 0x00, 0x0c
2192 dw 0x07, 0x0d, 0x0b, 0x06, 0x02, 0x08, 0x05, 0x0f
2193 dw 0x0c, 0x0b, 0x03, 0x0d, 0x0f, 0x0c, 0x06, 0x00
2194 dw 0x02, 0x05, 0x08, 0x0e, 0x01, 0x02, 0x0d, 0x07
2195 dw 0x0b, 0x01, 0x00, 0x06, 0x04, 0x0f, 0x09, 0x0a
2196 dw 0x0e, 0x08, 0x05, 0x03, 0x07, 0x04, 0x0a, 0x09
2197 ;; SBOX2
2198 dw 0x05, 0x0b, 0x08, 0x0d, 0x06, 0x01, 0x0d, 0x0a
2199 dw 0x09, 0x02, 0x03, 0x04, 0x0f, 0x0c, 0x04, 0x07
2200 dw 0x00, 0x06, 0x0b, 0x08, 0x0c, 0x0f, 0x02, 0x05
2201 dw 0x07, 0x09, 0x0e, 0x03, 0x0a, 0x00, 0x01, 0x0e
2202 dw 0x0b, 0x08, 0x04, 0x02, 0x0c, 0x06, 0x03, 0x0d
2203 dw 0x00, 0x0b, 0x0a, 0x07, 0x06, 0x01, 0x0f, 0x04
2204 dw 0x0e, 0x05, 0x01, 0x0f, 0x02, 0x09, 0x0d, 0x0a
2205 dw 0x09, 0x00, 0x07, 0x0c, 0x05, 0x0e, 0x08, 0x03
2206 ;; SBOX3
2207 dw 0x0e, 0x05, 0x08, 0x0f, 0x00, 0x03, 0x0d, 0x0a
2208 dw 0x07, 0x09, 0x01, 0x0c, 0x09, 0x0e, 0x02, 0x01
2209 dw 0x0b, 0x06, 0x04, 0x08, 0x06, 0x0d, 0x03, 0x04
2210 dw 0x0c, 0x00, 0x0a, 0x07, 0x05, 0x0b, 0x0f, 0x02
2211 dw 0x0b, 0x0c, 0x02, 0x09, 0x06, 0x05, 0x08, 0x03
2212 dw 0x0d, 0x00, 0x04, 0x0a, 0x00, 0x0b, 0x07, 0x04
2213 dw 0x01, 0x0f, 0x0e, 0x02, 0x0f, 0x08, 0x05, 0x0e
2214 dw 0x0a, 0x06, 0x03, 0x0d, 0x0c, 0x01, 0x09, 0x07
2215 ;; SBOX4
2216 dw 0x04, 0x02, 0x01, 0x0f, 0x0e, 0x05, 0x0b, 0x06
2217 dw 0x02, 0x08, 0x0c, 0x03, 0x0d, 0x0e, 0x07, 0x00
2218 dw 0x03, 0x04, 0x0a, 0x09, 0x05, 0x0b, 0x00, 0x0c
2219 dw 0x08, 0x0d, 0x0f, 0x0a, 0x06, 0x01, 0x09, 0x07
2220 dw 0x07, 0x0d, 0x0a, 0x06, 0x02, 0x08, 0x0c, 0x05
2221 dw 0x04, 0x03, 0x0f, 0x00, 0x0b, 0x04, 0x01, 0x0a
2222 dw 0x0d, 0x01, 0x00, 0x0f, 0x0e, 0x07, 0x09, 0x02
2223 dw 0x03, 0x0e, 0x05, 0x09, 0x08, 0x0b, 0x06, 0x0c
2224 ;; SBOX5
2225 dw 0x03, 0x09, 0x00, 0x0e, 0x09, 0x04, 0x07, 0x08
2226 dw 0x05, 0x0f, 0x0c, 0x02, 0x06, 0x03, 0x0a, 0x0d
2227 dw 0x08, 0x07, 0x0b, 0x00, 0x04, 0x01, 0x0e, 0x0b
2228 dw 0x0f, 0x0a, 0x02, 0x05, 0x01, 0x0c, 0x0d, 0x06
2229 dw 0x05, 0x02, 0x06, 0x0d, 0x0e, 0x09, 0x00, 0x06
2230 dw 0x02, 0x04, 0x0b, 0x08, 0x09, 0x0f, 0x0c, 0x01
2231 dw 0x0f, 0x0c, 0x08, 0x07, 0x03, 0x0a, 0x0d, 0x00
2232 dw 0x04, 0x03, 0x07, 0x0e, 0x0a, 0x05, 0x01, 0x0b
2233 ;; SBOX6
2234 dw 0x02, 0x08, 0x0c, 0x05, 0x0f, 0x03, 0x0a, 0x00
2235 dw 0x04, 0x0d, 0x09, 0x06, 0x01, 0x0e, 0x06, 0x09
2236 dw 0x0d, 0x02, 0x03, 0x0f, 0x00, 0x0c, 0x05, 0x0a
2237 dw 0x07, 0x0b, 0x0e, 0x01, 0x0b, 0x07, 0x08, 0x04
2238 dw 0x0b, 0x06, 0x07, 0x09, 0x02, 0x08, 0x04, 0x07
2239 dw 0x0d, 0x0b, 0x0a, 0x00, 0x08, 0x05, 0x01, 0x0c
2240 dw 0x00, 0x0d, 0x0c, 0x0a, 0x09, 0x02, 0x0f, 0x04
2241 dw 0x0e, 0x01, 0x03, 0x0f, 0x05, 0x0e, 0x06, 0x03
2242 ;; SBOX7
2243 dw 0x0b, 0x0e, 0x05, 0x00, 0x06, 0x09, 0x0a, 0x0f
2244 dw 0x01, 0x02, 0x0c, 0x05, 0x0d, 0x07, 0x03, 0x0a
2245 dw 0x04, 0x0d, 0x09, 0x06, 0x0f, 0x03, 0x00, 0x0c
2246 dw 0x02, 0x08, 0x07, 0x0b, 0x08, 0x04, 0x0e, 0x01
2247 dw 0x08, 0x04, 0x03, 0x0f, 0x05, 0x02, 0x00, 0x0c
2248 dw 0x0b, 0x07, 0x06, 0x09, 0x0e, 0x01, 0x09, 0x06
2249 dw 0x0f, 0x08, 0x0a, 0x03, 0x0c, 0x05, 0x07, 0x0a
2250 dw 0x01, 0x0e, 0x0d, 0x00, 0x02, 0x0b, 0x04, 0x0d
2251
;;; vec_ones_32b
;;; Sixteen 32-bit words, each holding the value 1 (64 bytes in total).
;;; Used in DOCSIS DES partial block scheduling.
align 64
vec_ones_32b:
        times 16 dd 1
2256
align 64
;;; and_eu
;;; 64-byte AND mask: in every 16-bit word it keeps bits 8..13, i.e. the low
;;; six bits of the upper byte (0x3f00 per word).
and_eu:
        times 16 dd 0x3f003f00
2263
align 64
;;; and_ed
;;; 64-byte AND mask: in every 16-bit word it keeps bits 0..5, i.e. the low
;;; six bits of the lower byte (0x003f per word).
and_ed:
        times 16 dd 0x003f003f
2270
2271 align 64
;;; idx_e: 64 byte-sized indices, arranged as four 16-byte groups with bases
;;; 0x00, 0x10, 0x20 and 0x30. Read in memory (little-endian) order, the low
;;; qword of each group lists the byte offsets of 16-bit words 0, 2, 4, 6 and
;;; the high qword those of words 1, 3, 5, 7 of the same group.
;;; NOTE(review): presumably consumed as the index operand of a byte permute;
;;; the instruction using it is outside this chunk - confirm.
2272 idx_e:
2273 dq 0x0d0c090805040100, 0x0f0e0b0a07060302
2274 dq 0x1d1c191815141110, 0x1f1e1b1a17161312
2275 dq 0x2d2c292825242120, 0x2f2e2b2a27262322
2276 dq 0x3d3c393835343130, 0x3f3e3b3a37363332
2277
align 64
;;; reg_values16bit_7
;;; 64 bytes in which every 16-bit word holds 0x001f (eight qwords of
;;; 0x001f001f001f001f).
reg_values16bit_7:
        times 8 dq 0x001f001f001f001f
2284
2285 align 64
;;; shuffle_reg: 64 byte-sized indices (0x00..0x3f). Read in memory
;;; (little-endian) order, every 4-byte group is i+0, i+2, i+1, i+3, i.e. the
;;; two middle bytes of each dword are exchanged and the outer bytes stay put.
;;; NOTE(review): presumably consumed as the index operand of a byte permute;
;;; the instruction using it is outside this chunk - confirm.
2286 shuffle_reg:
2287 dq 0x0705060403010200, 0x0f0d0e0c0b090a08
2288 dq 0x1715161413111210, 0x1f1d1e1c1b191a18
2289 dq 0x2725262423212220, 0x2f2d2e2c2b292a28
2290 dq 0x3735363433313230, 0x3f3d3e3c3b393a38
2291
2292 ;;; ========================================================
2293 ;;; CODE
2294 section .text
2295
;;; des_x16_cbc_enc_avx512
;;; Entry point whose whole body is the GENERIC_DES_ENC macro expanded with
;;; the DES selector, followed by ret. The entry is aligned to 64 bytes.
;;; NOTE(review): the arguments are presumably consumed through the STATE and
;;; SIZE register aliases defined earlier in the file - confirm there.
2296 ;;; arg 1 : pointer to DES OOO structure
2297 ;;; arg 2 : size in bytes
2298 align 64
2299 MKGLOBAL(des_x16_cbc_enc_avx512,function,internal)
2300 des_x16_cbc_enc_avx512:
2301 GENERIC_DES_ENC DES
2302 ret
2303
;;; des_x16_cbc_dec_avx512
;;; Entry point whose whole body is the GENERIC_DES_DEC macro expanded with
;;; the DES selector, followed by ret. The macro reads the structure pointer
;;; through STATE and the byte count through SIZE, and restores r12-r15 and
;;; rsp before it ends. The entry is aligned to 64 bytes.
2304 ;;; arg 1 : pointer to DES OOO structure
2305 ;;; arg 2 : size in bytes
2306 align 64
2307 MKGLOBAL(des_x16_cbc_dec_avx512,function,internal)
2308 des_x16_cbc_dec_avx512:
2309 GENERIC_DES_DEC DES
2310 ret
2311
;;; des3_x16_cbc_enc_avx512
;;; Entry point whose whole body is the GENERIC_DES_ENC macro expanded with
;;; the 3DES selector, followed by ret. The entry is aligned to 64 bytes.
;;; NOTE(review): the arguments are presumably consumed through the STATE and
;;; SIZE register aliases defined earlier in the file - confirm there.
2312 ;;; arg 1 : pointer to DES OOO structure
2313 ;;; arg 2 : size in bytes
2314 align 64
2315 MKGLOBAL(des3_x16_cbc_enc_avx512,function,internal)
2316 des3_x16_cbc_enc_avx512:
2317 GENERIC_DES_ENC 3DES
2318 ret
2319
;;; des3_x16_cbc_dec_avx512
;;; Entry point whose whole body is the GENERIC_DES_DEC macro expanded with
;;; the 3DES selector (DES3_INIT and the GEN_3DES_DEC_CIPHER passes with three
;;; key schedules), followed by ret. The macro reads the structure pointer
;;; through STATE and the byte count through SIZE, and restores r12-r15 and
;;; rsp before it ends. The entry is aligned to 64 bytes.
2320 ;;; arg 1 : pointer to DES OOO structure
2321 ;;; arg 2 : size in bytes
2322 align 64
2323 MKGLOBAL(des3_x16_cbc_dec_avx512,function,internal)
2324 des3_x16_cbc_dec_avx512:
2325 GENERIC_DES_DEC 3DES
2326 ret
2327
;;; docsis_des_x16_enc_avx512
;;; Entry point whose whole body is the GENERIC_DES_ENC macro expanded with
;;; the DOCSIS selector, followed by ret. The entry is aligned to 64 bytes.
;;; NOTE(review): the arguments are presumably consumed through the STATE and
;;; SIZE register aliases defined earlier in the file - confirm there.
2328 ;;; arg 1 : pointer to DES OOO structure
2329 ;;; arg 2 : size in bytes
2330 align 64
2331 MKGLOBAL(docsis_des_x16_enc_avx512,function,internal)
2332 docsis_des_x16_enc_avx512:
2333 GENERIC_DES_ENC DOCSIS
2334 ret
2335
;;; docsis_des_x16_dec_avx512
;;; Entry point whose whole body is the GENERIC_DES_DEC macro expanded with
;;; the DOCSIS selector (which adds the DES_CFB_ONE step ahead of the main
;;; loop), followed by ret. The macro reads the structure pointer through
;;; STATE and the byte count through SIZE, and restores r12-r15 and rsp
;;; before it ends. The entry is aligned to 64 bytes.
2336 ;;; arg 1 : pointer to DES OOO structure
2337 ;;; arg 2 : size in bytes
2338 align 64
2339 MKGLOBAL(docsis_des_x16_dec_avx512,function,internal)
2340 docsis_des_x16_dec_avx512:
2341 GENERIC_DES_DEC DOCSIS
2342 ret
2343
;;; When LINUX is defined, emit an empty .note.GNU-stack section flagged
;;; noexec. On ELF targets this is the conventional marker telling the linker
;;; that this object does not need an executable stack.
2344 %ifdef LINUX
2345 section .note.GNU-stack noalloc noexec nowrite progbits
2346 %endif