]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm
656752941899c3c01f269eca4f60374cdebdd94a
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx512 / des_x16_avx512.asm
1 ;;
2 ;; Copyright (c) 2017-2019, Intel Corporation
3 ;;
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
6 ;;
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
15 ;;
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ;;
27
28 ;; Authors:
29 ;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz Kantecki (2)
30 ;; (1) University of Haifa, Israel
31 ;; (2) Intel Corporation
32
33 ;; In System V AMD64 ABI
34 ;; callee saves: RBX, RBP, R12-R15
35 ;; Windows x64 ABI
36 ;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
37
38 ;;
39 ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
40 ;; -----------------------------------------------------------
41 ;; Windows clobbers: RAX R8 R9 R10 R11
42 ;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15
43 ;; -----------------------------------------------------------
44 ;; Linux clobbers: RAX RCX RDX R10 R11
45 ;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15
46 ;; -----------------------------------------------------------
47 ;; Clobbers ZMM0-31 and K1 to K7
48
49 %include "include/os.asm"
50 %include "include/reg_sizes.asm"
51 %include "mb_mgr_datastruct.asm"
52 %include "constants.asm"
53 ;%define DO_DBGPRINT
54 %include "include/dbgprint.asm"
55
;; First four integer argument registers of the target calling convention.
;; LINUX selects the System V AMD64 registers; any other build gets the
;; Microsoft x64 registers.
%ifndef LINUX
%define arg1    rcx
%define arg2    rdx
%define arg3    r8
%define arg4    r9
%else
%define arg1    rdi
%define arg2    rsi
%define arg3    rdx
%define arg4    rcx
%endif
67
;; ---------------------------------------------------------------------------
;; General purpose register roles
;; ---------------------------------------------------------------------------
%define STATE           arg1    ; first function argument
%define SIZE            arg2    ; second function argument

%define OFFSET          rax

;; scratch GPRs; IA0 is also used as a scratch register by S_PHASE and by
;; the key schedule set up macros
%define IA0             arg3
%define IA1             arg4
%define IA2             r10

%define INP0            r11
%define INP1            r12
%define INP2            r13
%define INP3            r14
%define INP4            r15

;; byte offset into the transposed key schedule inside DES_ENC_DEC
;; NOTE: KSOFFSET and INP0 both map to r11, so INP0 does not survive
;;       a DES_ENC_DEC invocation
%define KSOFFSET        r11

;; ---------------------------------------------------------------------------
;; Vector register roles
;; ---------------------------------------------------------------------------
;; ZW0 - ZW15: zmm0 - zmm15
%define ZW0             zmm0
%define ZW1             zmm1
%define ZW2             zmm2
%define ZW3             zmm3
%define ZW4             zmm4
%define ZW5             zmm5
%define ZW6             zmm6
%define ZW7             zmm7
%define ZW8             zmm8
%define ZW9             zmm9
%define ZW10            zmm10
%define ZW11            zmm11
%define ZW12            zmm12
%define ZW13            zmm13
%define ZW14            zmm14
%define ZW15            zmm15

;; ZIV0 - ZIV1: zmm16 - zmm17
%define ZIV0            zmm16
%define ZIV1            zmm17

;; ZTMP0 - ZTMP13: zmm18 - zmm31
%define ZTMP0           zmm18
%define ZTMP1           zmm19
%define ZTMP2           zmm20
%define ZTMP3           zmm21
%define ZTMP4           zmm22
%define ZTMP5           zmm23
%define ZTMP6           zmm24
%define ZTMP7           zmm25
%define ZTMP8           zmm26
%define ZTMP9           zmm27
%define ZTMP10          zmm28
%define ZTMP11          zmm29
%define ZTMP12          zmm30
%define ZTMP13          zmm31
119
;; Stack frame layout.
;; NOTE: CLEAR_KEY_SCHEDULE wipes 3 x 2048 bytes starting at _key_sched for
;;       3DES, so the three key schedule areas have to stay adjacent and in
;;       this order.
struc STACKFRAME
_key_sched:     resq    16*16   ; 2048 bytes: 16 lanes x 16 qwords (128 bytes per lane)
_key_sched2:    resq    16*16   ; 2048 bytes: 16 lanes x 16 qwords (128 bytes per lane)
_key_sched3:    resq    16*16   ; 2048 bytes: 16 lanes x 16 qwords (128 bytes per lane)
_tmp_iv:        resq    16      ; 128 bytes (2 x 64)
_tmp_in:        resq    16      ; 128 bytes (2 x 64)
_tmp_out:       resq    16      ; 128 bytes (2 x 64)
_tmp_mask:      resd    16      ; 64 bytes (1 x 64)
_gpr_save:      resq    4       ; r12 to r15
_rsp_save:      resq    1
_mask_save:     resq    1
_size_save:     resq    1
endstruc
133
134 ;;; ===========================================================================
135 ;;; ===========================================================================
136 ;;; MACROS
137 ;;; ===========================================================================
138 ;;; ===========================================================================
139
;;; ===========================================================================
;;; CLEAR TRANSPOSED KEY SCHEDULE (if SAFE_DATA is selected)
;;; ===========================================================================
;;;
;;; Overwrites the transposed key schedule kept on the stack with zeros.
;;; Emits no code unless SAFE_DATA is defined.
;;;
;;; ALG [in]        - DES or 3DES; 3DES clears three consecutive 2048 byte
;;;                   areas starting at _key_sched, anything else clears one
;;; ZT  [clobbered] - temporary ZMM register (left zeroed)
;;;
;;; Uses aligned 64-byte stores relative to rsp.
;;; Side effect: (re)assigns the preprocessor symbols rep_num and offset.
%macro CLEAR_KEY_SCHEDULE 2
%define %%CIPHER        %1
%define %%ZERO          %2

%ifdef SAFE_DATA
        vpxorq          %%ZERO, %%ZERO, %%ZERO

        ;; number of 64-byte stores needed
%ifidn %%CIPHER, 3DES
%assign rep_num (3 * (2048 / 64))
%else
%assign rep_num (2048 / 64)
%endif

%assign offset 0
%rep rep_num
        vmovdqa64       [rsp + _key_sched + offset], %%ZERO
%assign offset (offset + 64)
%endrep

%endif ; SAFE_DATA

%endmacro
163
;;; ===========================================================================
;;; PERMUTE
;;; ===========================================================================
;;;
;;; Swaps the bits selected by MASK in B with the bits of A found NSHIFT
;;; positions higher, independently in every 32-bit element:
;;;     t  = ((A >> NSHIFT) ^ B) & MASK
;;;     B ^= t
;;;     A ^= t << NSHIFT
;;;
;;; A      [in/out]    - zmm register
;;; B      [in/out]    - zmm register
;;; NSHIFT [in]        - constant to shift words by
;;; MASK   [in]        - zmm or m512 with mask
;;; T0     [clobbered] - temporary zmm register
%macro PERMUTE 5
%define %%UPPER         %1
%define %%LOWER         %2
%define %%COUNT         %3
%define %%SELECT        %4
%define %%DELTA         %5

        vpsrld          %%DELTA, %%UPPER, %%COUNT
        vpxord          %%DELTA, %%DELTA, %%LOWER
        vpandd          %%DELTA, %%DELTA, %%SELECT      ; bits that differ
        vpxord          %%LOWER, %%LOWER, %%DELTA
        vpslld          %%DELTA, %%DELTA, %%COUNT
        vpxord          %%UPPER, %%UPPER, %%DELTA
%endmacro
186
;;; ===========================================================================
;;; INITIAL PERMUTATION
;;; ===========================================================================
;;;
;;; Five PERMUTE steps (shifts 4, 16, 2, 8, 1) driven by the masks stored
;;; at init_perm_consts + 0*64 .. 4*64.
;;;
;;; L  [in/out]    - zmm register
;;; R  [in/out]    - zmm register
;;; T0 [clobbered] - temporary zmm register
%macro IP_Z 3
%define %%LEFT          %1
%define %%RIGHT         %2
%define %%SCRATCH       %3

        PERMUTE         %%RIGHT, %%LEFT,   4, [rel init_perm_consts + 0*64], %%SCRATCH
        PERMUTE         %%LEFT,  %%RIGHT, 16, [rel init_perm_consts + 1*64], %%SCRATCH
        PERMUTE         %%RIGHT, %%LEFT,   2, [rel init_perm_consts + 2*64], %%SCRATCH
        PERMUTE         %%LEFT,  %%RIGHT,  8, [rel init_perm_consts + 3*64], %%SCRATCH
        PERMUTE         %%RIGHT, %%LEFT,   1, [rel init_perm_consts + 4*64], %%SCRATCH
%endmacro
203
;;; ===========================================================================
;;; FINAL PERMUTATION
;;; ===========================================================================
;;;
;;; The PERMUTE steps of IP_Z applied in reverse order (shifts 1, 8, 2, 16, 4)
;;; with the same masks from init_perm_consts.
;;;
;;; L  [in/out]    - zmm register
;;; R  [in/out]    - zmm register
;;; T0 [clobbered] - temporary zmm register
%macro FP_Z 3
%define %%LEFT          %1
%define %%RIGHT         %2
%define %%SCRATCH       %3

        PERMUTE         %%LEFT,  %%RIGHT,  1, [rel init_perm_consts + 4*64], %%SCRATCH
        PERMUTE         %%RIGHT, %%LEFT,   8, [rel init_perm_consts + 3*64], %%SCRATCH
        PERMUTE         %%LEFT,  %%RIGHT,  2, [rel init_perm_consts + 2*64], %%SCRATCH
        PERMUTE         %%RIGHT, %%LEFT,  16, [rel init_perm_consts + 1*64], %%SCRATCH
        PERMUTE         %%LEFT,  %%RIGHT,  4, [rel init_perm_consts + 0*64], %%SCRATCH
%endmacro
220
;;; ===========================================================================
;;; P PHASE
;;; ===========================================================================
;;;
;;; Builds the permuted vector as the OR of 19 terms, each term being the
;;; input rotated right by a constant and ANDed with one 64-byte mask from
;;; the mask_values table (entries 0 to 18).
;;;
;;; W0    [in/out]    - zmm register
;;;                     in:  vector of 16 x 32bits from S phase
;;;                     out: permuted in vector
;;; T0-T3 [clobbered] - temporary zmm register
%macro P_PHASE 5
%define %%X             %1
%define %%ACC           %2
%define %%GA            %3
%define %%GB            %4
%define %%GC            %5

        ;; terms 0 - 1
        vprord          %%ACC, %%X, 3
        vpandd          %%ACC, %%ACC, [rel mask_values + 0*64]
        vprord          %%GA, %%X, 5
        vpandd          %%GA, %%GA, [rel mask_values + 1*64]
        vpord           %%ACC, %%ACC, %%GA

        ;; terms 2 - 3
        vprord          %%GA, %%X, 24
        vpandd          %%GA, %%GA, [rel mask_values + 2*64]
        vprord          %%GB, %%X, 26
        vpandd          %%GB, %%GB, [rel mask_values + 3*64]
        vpord           %%GA, %%GA, %%GB
        vpord           %%ACC, %%ACC, %%GA

        ;; terms 4 - 7
        vprord          %%GA, %%X, 15
        vpandd          %%GA, %%GA, [rel mask_values + 4*64]
        vprord          %%GB, %%X, 17
        vpandd          %%GB, %%GB, [rel mask_values + 5*64]
        vpord           %%GA, %%GA, %%GB

        vprord          %%GB, %%X, 6
        vpandd          %%GB, %%GB, [rel mask_values + 6*64]
        vprord          %%GC, %%X, 21
        vpandd          %%GC, %%GC, [rel mask_values + 7*64]
        vpord           %%GB, %%GB, %%GC
        vpord           %%GA, %%GA, %%GB
        vpord           %%ACC, %%ACC, %%GA

        ;; terms 8 - 11
        vprord          %%GA, %%X, 12
        vpandd          %%GA, %%GA, [rel mask_values + 8*64]
        vprord          %%GB, %%X, 14
        vpandd          %%GB, %%GB, [rel mask_values + 9*64]
        vpord           %%GA, %%GA, %%GB

        vprord          %%GB, %%X, 4
        vpandd          %%GB, %%GB, [rel mask_values + 10*64]
        vprord          %%GC, %%X, 11
        vpandd          %%GC, %%GC, [rel mask_values + 11*64]
        vpord           %%GB, %%GB, %%GC
        vpord           %%GA, %%GA, %%GB
        vpord           %%ACC, %%ACC, %%GA

        ;; terms 12 - 15
        vprord          %%GA, %%X, 16
        vpandd          %%GA, %%GA, [rel mask_values + 12*64]
        vprord          %%GB, %%X, 22
        vpandd          %%GB, %%GB, [rel mask_values + 13*64]
        vpord           %%GA, %%GA, %%GB

        vprord          %%GB, %%X, 19
        vpandd          %%GB, %%GB, [rel mask_values + 14*64]
        vprord          %%GC, %%X, 10
        vpandd          %%GC, %%GC, [rel mask_values + 15*64]
        vpord           %%GB, %%GB, %%GC
        vpord           %%GA, %%GA, %%GB
        vpord           %%ACC, %%ACC, %%GA

        ;; terms 16 - 18 and the final merge back into the in/out register
        vprord          %%GA, %%X, 9
        vpandd          %%GA, %%GA, [rel mask_values + 16*64]
        vprord          %%GB, %%X, 13
        vpandd          %%GB, %%GB, [rel mask_values + 17*64]
        vpord           %%GA, %%GA, %%GB

        vprord          %%GB, %%X, 25
        vpandd          %%GB, %%GB, [rel mask_values + 18*64]
        vpord           %%GA, %%GA, %%GB
        vpord           %%X, %%ACC, %%GA
%endmacro
301
;;; ===========================================================================
;;; E PHASE
;;; ===========================================================================
;;;
;;; Expands 16x32-bit words into 16x48-bit words
;;; plus XOR's result with the key schedule.
;;; The output is adjusted to be friendly as S phase input.
;;;
;;; Steps:
;;;  - rotate the input right by 31 and by 3 bits
;;;  - byte shuffle both copies with idx_e and interleave their bytes
;;;    (low halves -> out0a, high halves -> out1a)
;;;  - XOR with k0 / k1
;;;  - split every 16-bit element into two vectors: the bits selected by
;;;    and_ed stay in place (out0a / out1a), the bits selected by and_eu are
;;;    moved down by 8 (out0b / out1b)
;;;
;;; in    [in]        - zmm register
;;; out0a [out]       - zmm register
;;; out0b [out]       - zmm register
;;; out1a [out]       - zmm register
;;; out1b [out]       - zmm register
;;; k0    [in]        - key schedule; zmm or m512
;;; k1    [in]        - key schedule; zmm or m512
;;; t0-t1 [clobbered] - temporary zmm register
%macro E_PHASE 9
%define %%SRC           %1
%define %%LO_A          %2
%define %%LO_B          %3
%define %%HI_A          %4
%define %%HI_B          %5
%define %%KEY_LO        %6
%define %%KEY_HI        %7
%define %%ROT_A         %8
%define %%ROT_B         %9

        vprord          %%ROT_A, %%SRC, 31
        vprord          %%ROT_B, %%SRC, 3
        vpshufb         %%ROT_A, %%ROT_A, [rel idx_e]
        vpshufb         %%ROT_B, %%ROT_B, [rel idx_e]
        vpunpcklbw      %%LO_A, %%ROT_A, %%ROT_B
        vpunpckhbw      %%HI_A, %%ROT_A, %%ROT_B

        ;; mix in the round key
        vpxord          %%LO_A, %%LO_A, %%KEY_LO
        vpxord          %%HI_A, %%HI_A, %%KEY_HI

        ;; separate the two parts of each 16-bit element
        vpandd          %%LO_B, %%LO_A, [rel and_eu]
        vpsrlw          %%LO_B, %%LO_B, 8
        vpandd          %%LO_A, %%LO_A, [rel and_ed]
        vpandd          %%HI_B, %%HI_A, [rel and_eu]
        vpsrlw          %%HI_B, %%HI_B, 8
        vpandd          %%HI_A, %%HI_A, [rel and_ed]
%endmacro
344
;;; ===========================================================================
;;; S-BOX
;;; ===========================================================================
;;;
;;; NOTE: clobbers k1-k6 OpMask registers and the IA0 general purpose register
;;;
;;; Every input vector is used as a set of 16-bit indexes into four 64-byte
;;; tables of S_box_flipped. Even word positions (k1 = 0x55555555) and odd
;;; word positions (k2 = 0xaaaaaaaa) are looked up in different tables.
;;; For each word, one of the two candidate lookups is then selected depending
;;; on whether the index is below or equal to the matching word of
;;; reg_values16bit_7 (k3-k6). The four partial results are merged with
;;; shifts by 4 and 8 bits and finally byte shuffled with shuffle_reg.
;;;
;;; IN0A  [in]        - zmm register; output from E-phase
;;; IN0B  [in]        - zmm register; output from E-phase
;;; IN1A  [in]        - zmm register; output from E-phase
;;; IN1B  [in]        - zmm register; output from E-phase
;;; OUT   [out]       - zmm register; output of the S-phase
;;; T0-T5 [clobbered] - temporary zmm register
%macro S_PHASE 11
%define %%X0A           %1
%define %%X0B           %2
%define %%X1A           %3
%define %%X1B           %4
%define %%RES           %5
%define %%TA            %6
%define %%TB            %7
%define %%TC            %8
%define %%TD            %9
%define %%TE            %10
%define %%TF            %11

        ;; per-word table selection masks
        vmovdqa64       %%TA, [rel reg_values16bit_7]
        vpcmpuw         k3, %%X0A, %%TA, 2      ; 2 -> LE
        vpcmpuw         k4, %%X0B, %%TA, 2      ; 2 -> LE
        vpcmpuw         k5, %%X1A, %%TA, 2      ; 2 -> LE
        vpcmpuw         k6, %%X1B, %%TA, 2      ; 2 -> LE

        ;; k1 - even word positions, k2 - odd word positions
        mov             DWORD(IA0), 0x55555555
        kmovd           k1, DWORD(IA0)
        mov             DWORD(IA0), 0xaaaaaaaa
        kmovd           k2, DWORD(IA0)

        ;; IN0A: tables 0, 1, 4, 5 -> OUT
        vpermw          %%TA{k1}{z}, %%X0A, [rel S_box_flipped + 0*64]
        vpermw          %%TB{k1}{z}, %%X0A, [rel S_box_flipped + 1*64]
        vpermw          %%TC{k2}{z}, %%X0A, [rel S_box_flipped + 4*64]
        vpermw          %%TD{k2}{z}, %%X0A, [rel S_box_flipped + 5*64]
        vpxord          %%TA, %%TA, %%TC
        vpxord          %%RES, %%TB, %%TD
        vmovdqu16       %%RES{k3}, %%TA

        ;; IN0B: tables 2, 3, 6, 7 -> shifted left by 4 and merged into OUT
        vpermw          %%TA{k1}{z}, %%X0B, [rel S_box_flipped + 2*64]
        vpermw          %%TB{k1}{z}, %%X0B, [rel S_box_flipped + 3*64]
        vpermw          %%TC{k2}{z}, %%X0B, [rel S_box_flipped + 6*64]
        vpermw          %%TD{k2}{z}, %%X0B, [rel S_box_flipped + 7*64]
        vpxord          %%TA, %%TA, %%TC
        vpxord          %%TD, %%TB, %%TD
        vmovdqu16       %%TD{k4}, %%TA
        vpsllw          %%TD, %%TD, 4
        vpxord          %%RES, %%RES, %%TD

        ;; IN1A: tables 8, 9, 12, 13
        vpermw          %%TA{k1}{z}, %%X1A, [rel S_box_flipped + 8*64]
        vpermw          %%TB{k1}{z}, %%X1A, [rel S_box_flipped + 9*64]
        vpermw          %%TC{k2}{z}, %%X1A, [rel S_box_flipped + 12*64]
        vpermw          %%TD{k2}{z}, %%X1A, [rel S_box_flipped + 13*64]
        vpxord          %%TA, %%TA, %%TC
        vpxord          %%TE, %%TB, %%TD
        vmovdqu16       %%TE{k5}, %%TA

        ;; IN1B: tables 10, 11, 14, 15 -> shifted left by 4
        vpermw          %%TA{k1}{z}, %%X1B, [rel S_box_flipped + 10*64]
        vpermw          %%TB{k1}{z}, %%X1B, [rel S_box_flipped + 11*64]
        vpermw          %%TC{k2}{z}, %%X1B, [rel S_box_flipped + 14*64]
        vpermw          %%TD{k2}{z}, %%X1B, [rel S_box_flipped + 15*64]
        vpxord          %%TA, %%TA, %%TC
        vpxord          %%TF, %%TB, %%TD
        vmovdqu16       %%TF{k6}, %%TA
        vpsllw          %%TF, %%TF, 4

        ;; IN1A/IN1B results go to the upper byte of every word of OUT
        vpxord          %%TE, %%TE, %%TF
        vpsllw          %%TE, %%TE, 8
        vpxord          %%RES, %%RES, %%TE
        vpshufb         %%RES, %%RES, [rel shuffle_reg]
%endmacro
421
;;; ===========================================================================
;;; DES encryption/decryption round
;;; ===========================================================================
;;;
;;; Runs the initial permutation, 8 loop iterations of two rounds each and the
;;; final permutation. Every round reads 2 x 64 bytes of the key schedule;
;;; encryption walks the schedule upwards from offset 0, decryption walks it
;;; downwards from offset 8 * (4 * 64).
;;;
;;; Clobbers k1-k6 OpMask registers (S_PHASE), KSOFFSET and IA0 (S_PHASE)
;;;
;;; ENC_DEC [in]        - ENC for encryption, DEC for decryption
;;; R       [in/out]    - zmm register; plain text in & cipher text out
;;; L       [in/out]    - zmm register; plain text in & cipher text out
;;; KS      [in]        - pointer to the key schedule
;;; T0-T11  [clobbered] - temporary zmm register
;;;                       (T5, the 10th parameter, is accepted but never
;;;                       referenced)
%macro DES_ENC_DEC 16
%define %%DIR           %1
%define %%R             %2
%define %%L             %3
%define %%SCHED         %4
%define %%F             %5      ; round function result
%define %%E0A           %6      ; E phase outputs / P phase scratch
%define %%E0B           %7
%define %%E1A           %8
%define %%E1B           %9
%define %%SPARE         %10     ; not referenced
%define %%X0            %11     ; E/S phase scratch
%define %%X1            %12
%define %%X2            %13     ; S phase scratch
%define %%X3            %14
%define %%X4            %15
%define %%X5            %16

        IP_Z            %%R, %%L, %%F

%ifidn %%DIR, ENC
        ;; ENCRYPTION - key schedule consumed front to back
        xor             KSOFFSET, KSOFFSET
%%_enc_round_pair:
        E_PHASE         %%R, %%E0A, %%E0B, %%E1A, %%E1B, [%%SCHED + KSOFFSET + (0*64)], [%%SCHED + KSOFFSET + (1*64)], %%X0, %%X1
        S_PHASE         %%E0A, %%E0B, %%E1A, %%E1B, %%F, %%X0, %%X1, %%X2, %%X3, %%X4, %%X5
        P_PHASE         %%F, %%E0A, %%E0B, %%E1A, %%E1B
        vpxord          %%L, %%L, %%F

        E_PHASE         %%L, %%E0A, %%E0B, %%E1A, %%E1B, [%%SCHED + KSOFFSET + (2*64)], [%%SCHED + KSOFFSET + (3*64)], %%X0, %%X1
        S_PHASE         %%E0A, %%E0B, %%E1A, %%E1B, %%F, %%X0, %%X1, %%X2, %%X3, %%X4, %%X5
        P_PHASE         %%F, %%E0A, %%E0B, %%E1A, %%E1B
        vpxord          %%R, %%R, %%F

        add             KSOFFSET, (4*64)
        cmp             KSOFFSET, (8*(4*64))
        jb              %%_enc_round_pair

%else
        ;; DECRYPTION - key schedule consumed back to front
        mov             KSOFFSET, (8*(4*64))
%%_dec_round_pair:
        E_PHASE         %%R, %%E0A, %%E0B, %%E1A, %%E1B, [%%SCHED + KSOFFSET - (2*64)], [%%SCHED + KSOFFSET - (1*64)], %%X0, %%X1
        S_PHASE         %%E0A, %%E0B, %%E1A, %%E1B, %%F, %%X0, %%X1, %%X2, %%X3, %%X4, %%X5
        P_PHASE         %%F, %%E0A, %%E0B, %%E1A, %%E1B
        vpxord          %%L, %%L, %%F

        E_PHASE         %%L, %%E0A, %%E0B, %%E1A, %%E1B, [%%SCHED + KSOFFSET - (4*64)], [%%SCHED + KSOFFSET - (3*64)], %%X0, %%X1
        S_PHASE         %%E0A, %%E0B, %%E1A, %%E1B, %%F, %%X0, %%X1, %%X2, %%X3, %%X4, %%X5
        P_PHASE         %%F, %%E0A, %%E0B, %%E1A, %%E1B
        vpxord          %%R, %%R, %%F
        sub             KSOFFSET, (4*64)
        jnz             %%_dec_round_pair
%endif ; DECRYPTION

        FP_Z            %%R, %%L, %%F
%endmacro
490
;;; ===========================================================================
;;; DATA TRANSPOSITION AT DATA INPUT
;;; ===========================================================================
;;;
;;; 16x16 transposition of 32-bit words done in three stages:
;;; dword interleave, qword interleave and 128-bit lane shuffle.
;;; The 16 data registers are reused as scratch space between the stages,
;;; so the instruction order below must not be changed.
;;;
;;; IN00 - IN15 [in/out]:
;;;   in:  IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
;;;   out: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
;;; T0-T3 [clobbered] - temporary zmm registers
;;; K0-K5 [clobbered] - temporary zmm registers
;;; H0-H3 [clobbered] - temporary zmm registers
%macro TRANSPOSE_IN 30
%define %%D00           %1      ; R0
%define %%D01           %2      ; L0
%define %%D02           %3      ; R1
%define %%D03           %4      ; L1
%define %%D04           %5      ; R2
%define %%D05           %6      ; L2
%define %%D06           %7      ; R3
%define %%D07           %8      ; L3
%define %%D08           %9      ; R4
%define %%D09           %10     ; L4
%define %%D10           %11     ; R5
%define %%D11           %12     ; L5
%define %%D12           %13     ; R6
%define %%D13           %14     ; L6
%define %%D14           %15     ; R7
%define %%D15           %16     ; L7
%define %%U0            %17
%define %%U1            %18
%define %%U2            %19
%define %%U3            %20
%define %%V0            %21
%define %%V1            %22
%define %%V2            %23
%define %%V3            %24
%define %%V4            %25
%define %%V5            %26
%define %%S0            %27
%define %%S1            %28
%define %%S2            %29
%define %%S3            %30

        ;; lanes 0-7: dword interleave
        vpunpckldq      %%V0, %%D00, %%D01
        vpunpckhdq      %%V1, %%D00, %%D01
        vpunpckldq      %%U0, %%D02, %%D03
        vpunpckhdq      %%U1, %%D02, %%D03

        vpunpckldq      %%D00, %%D04, %%D05
        vpunpckhdq      %%D01, %%D04, %%D05
        vpunpckldq      %%D02, %%D06, %%D07
        vpunpckhdq      %%D03, %%D06, %%D07

        ;; lanes 0-7: qword interleave
        vpunpcklqdq     %%V2, %%V0, %%U0
        vpunpckhqdq     %%U2, %%V0, %%U0
        vpunpcklqdq     %%V3, %%V1, %%U1
        vpunpckhqdq     %%U3, %%V1, %%U1

        vpunpcklqdq     %%V0, %%D00, %%D02
        vpunpckhqdq     %%V1, %%D00, %%D02
        vpunpcklqdq     %%U0, %%D01, %%D03
        vpunpckhqdq     %%U1, %%D01, %%D03

        ;; lanes 8-15: dword interleave
        vpunpckldq      %%V4, %%D08, %%D09
        vpunpckhdq      %%V5, %%D08, %%D09
        vpunpckldq      %%D04, %%D10, %%D11
        vpunpckhdq      %%D05, %%D10, %%D11
        vpunpckldq      %%D06, %%D12, %%D13
        vpunpckhdq      %%D07, %%D12, %%D13
        vpunpckldq      %%D10, %%D14, %%D15
        vpunpckhdq      %%D11, %%D14, %%D15

        ;; lanes 8-15: qword interleave
        vpunpcklqdq     %%D12, %%V4, %%D04
        vpunpckhqdq     %%D13, %%V4, %%D04
        vpunpcklqdq     %%D14, %%V5, %%D05
        vpunpckhqdq     %%D15, %%V5, %%D05
        vpunpcklqdq     %%D00, %%D06, %%D10
        vpunpckhqdq     %%D01, %%D06, %%D10
        vpunpcklqdq     %%D02, %%D07, %%D11
        vpunpckhqdq     %%D03, %%D07, %%D11

        ;; 128-bit lane shuffles -> words 0, 4, 8, 12
        vshufi64x2      %%S0, %%V2, %%V0, 0x44
        vshufi64x2      %%S1, %%V2, %%V0, 0xee
        vshufi64x2      %%S2, %%D12, %%D00, 0x44
        vshufi64x2      %%S3, %%D12, %%D00, 0xee
        vshufi64x2      %%D00, %%S0, %%S2, 0x88 ; R0
        vshufi64x2      %%D04, %%S0, %%S2, 0xdd ; R2
        vshufi64x2      %%D08, %%S1, %%S3, 0x88 ; R4
        vshufi64x2      %%D12, %%S1, %%S3, 0xdd ; R6

        ;; 128-bit lane shuffles -> words 1, 5, 9, 13
        vshufi64x2      %%S0, %%U2, %%V1, 0x44
        vshufi64x2      %%S1, %%U2, %%V1, 0xee
        vshufi64x2      %%S2, %%D13, %%D01, 0x44
        vshufi64x2      %%S3, %%D13, %%D01, 0xee
        vshufi64x2      %%D01, %%S0, %%S2, 0x88 ; L0
        vshufi64x2      %%D05, %%S0, %%S2, 0xdd ; L2
        vshufi64x2      %%D09, %%S1, %%S3, 0x88 ; L4
        vshufi64x2      %%D13, %%S1, %%S3, 0xdd ; L6

        ;; 128-bit lane shuffles -> words 2, 6, 10, 14
        vshufi64x2      %%S0, %%V3, %%U0, 0x44
        vshufi64x2      %%S1, %%V3, %%U0, 0xee
        vshufi64x2      %%S2, %%D14, %%D02, 0x44
        vshufi64x2      %%S3, %%D14, %%D02, 0xee
        vshufi64x2      %%D02, %%S0, %%S2, 0x88 ; R1
        vshufi64x2      %%D06, %%S0, %%S2, 0xdd ; R3
        vshufi64x2      %%D10, %%S1, %%S3, 0x88 ; R5
        vshufi64x2      %%D14, %%S1, %%S3, 0xdd ; R7

        ;; 128-bit lane shuffles -> words 3, 7, 11, 15
        vshufi64x2      %%S0, %%U3, %%U1, 0x44
        vshufi64x2      %%S1, %%U3, %%U1, 0xee
        vshufi64x2      %%S2, %%D15, %%D03, 0x44
        vshufi64x2      %%S3, %%D15, %%D03, 0xee
        vshufi64x2      %%D03, %%S0, %%S2, 0x88 ; L1
        vshufi64x2      %%D07, %%S0, %%S2, 0xdd ; L3
        vshufi64x2      %%D11, %%S1, %%S3, 0x88 ; L5
        vshufi64x2      %%D15, %%S1, %%S3, 0xdd ; L7
%endmacro
607
;;; ===========================================================================
;;; DATA TRANSPOSITION AT DATA OUTPUT
;;; ===========================================================================
;;;
;;; Same three stage transposition as TRANSPOSE_IN (dword interleave, qword
;;; interleave, 128-bit lane shuffle) except that every dword interleave
;;; takes the odd register of a pair as its first source, so the two words of
;;; each register pair come out swapped.
;;; The 16 data registers are reused as scratch space between the stages,
;;; so the instruction order below must not be changed.
;;;
;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
;;;   in:  R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
;;;   out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
;;; T0-T3 [clobbered] - temporary zmm registers
;;; K0-K5 [clobbered] - temporary zmm registers
;;; H0-H3 [clobbered] - temporary zmm registers
%macro TRANSPOSE_OUT 30
%define %%D00           %1      ; R0
%define %%D01           %2      ; L0
%define %%D02           %3      ; R1
%define %%D03           %4      ; L1
%define %%D04           %5      ; R2
%define %%D05           %6      ; L2
%define %%D06           %7      ; R3
%define %%D07           %8      ; L3
%define %%D08           %9      ; R4
%define %%D09           %10     ; L4
%define %%D10           %11     ; R5
%define %%D11           %12     ; L5
%define %%D12           %13     ; R6
%define %%D13           %14     ; L6
%define %%D14           %15     ; R7
%define %%D15           %16     ; L7
%define %%U0            %17
%define %%U1            %18
%define %%U2            %19
%define %%U3            %20
%define %%V0            %21
%define %%V1            %22
%define %%V2            %23
%define %%V3            %24
%define %%V4            %25
%define %%V5            %26
%define %%S0            %27
%define %%S1            %28
%define %%S2            %29
%define %%S3            %30

        ;; registers 0-7: dword interleave (odd register first)
        vpunpckldq      %%V0, %%D01, %%D00
        vpunpckhdq      %%V1, %%D01, %%D00
        vpunpckldq      %%U0, %%D03, %%D02
        vpunpckhdq      %%U1, %%D03, %%D02

        vpunpckldq      %%D00, %%D05, %%D04
        vpunpckhdq      %%D01, %%D05, %%D04
        vpunpckldq      %%D02, %%D07, %%D06
        vpunpckhdq      %%D03, %%D07, %%D06

        ;; registers 0-7: qword interleave
        vpunpcklqdq     %%V2, %%V0, %%U0
        vpunpckhqdq     %%U2, %%V0, %%U0
        vpunpcklqdq     %%V3, %%V1, %%U1
        vpunpckhqdq     %%U3, %%V1, %%U1

        vpunpcklqdq     %%V0, %%D00, %%D02
        vpunpckhqdq     %%V1, %%D00, %%D02
        vpunpcklqdq     %%U0, %%D01, %%D03
        vpunpckhqdq     %%U1, %%D01, %%D03

        ;; registers 8-15: dword interleave (odd register first)
        vpunpckldq      %%V4, %%D09, %%D08
        vpunpckhdq      %%V5, %%D09, %%D08
        vpunpckldq      %%D04, %%D11, %%D10
        vpunpckhdq      %%D05, %%D11, %%D10
        vpunpckldq      %%D06, %%D13, %%D12
        vpunpckhdq      %%D07, %%D13, %%D12
        vpunpckldq      %%D10, %%D15, %%D14
        vpunpckhdq      %%D11, %%D15, %%D14

        ;; registers 8-15: qword interleave
        vpunpcklqdq     %%D12, %%V4, %%D04
        vpunpckhqdq     %%D13, %%V4, %%D04
        vpunpcklqdq     %%D14, %%V5, %%D05
        vpunpckhqdq     %%D15, %%V5, %%D05
        vpunpcklqdq     %%D00, %%D06, %%D10
        vpunpckhqdq     %%D01, %%D06, %%D10
        vpunpcklqdq     %%D02, %%D07, %%D11
        vpunpckhqdq     %%D03, %%D07, %%D11

        ;; 128-bit lane shuffles -> registers 0, 4, 8, 12
        vshufi64x2      %%S0, %%V2, %%V0, 0x44
        vshufi64x2      %%S1, %%V2, %%V0, 0xee
        vshufi64x2      %%S2, %%D12, %%D00, 0x44
        vshufi64x2      %%S3, %%D12, %%D00, 0xee
        vshufi64x2      %%D00, %%S0, %%S2, 0x88 ; R0
        vshufi64x2      %%D04, %%S0, %%S2, 0xdd ; R2
        vshufi64x2      %%D08, %%S1, %%S3, 0x88 ; R4
        vshufi64x2      %%D12, %%S1, %%S3, 0xdd ; R6

        ;; 128-bit lane shuffles -> registers 1, 5, 9, 13
        vshufi64x2      %%S0, %%U2, %%V1, 0x44
        vshufi64x2      %%S1, %%U2, %%V1, 0xee
        vshufi64x2      %%S2, %%D13, %%D01, 0x44
        vshufi64x2      %%S3, %%D13, %%D01, 0xee
        vshufi64x2      %%D01, %%S0, %%S2, 0x88 ; L0
        vshufi64x2      %%D05, %%S0, %%S2, 0xdd ; L2
        vshufi64x2      %%D09, %%S1, %%S3, 0x88 ; L4
        vshufi64x2      %%D13, %%S1, %%S3, 0xdd ; L6

        ;; 128-bit lane shuffles -> registers 2, 6, 10, 14
        vshufi64x2      %%S0, %%V3, %%U0, 0x44
        vshufi64x2      %%S1, %%V3, %%U0, 0xee
        vshufi64x2      %%S2, %%D14, %%D02, 0x44
        vshufi64x2      %%S3, %%D14, %%D02, 0xee
        vshufi64x2      %%D02, %%S0, %%S2, 0x88 ; R1
        vshufi64x2      %%D06, %%S0, %%S2, 0xdd ; R3
        vshufi64x2      %%D10, %%S1, %%S3, 0x88 ; R5
        vshufi64x2      %%D14, %%S1, %%S3, 0xdd ; R7

        ;; 128-bit lane shuffles -> registers 3, 7, 11, 15
        vshufi64x2      %%S0, %%U3, %%U1, 0x44
        vshufi64x2      %%S1, %%U3, %%U1, 0xee
        vshufi64x2      %%S2, %%D15, %%D03, 0x44
        vshufi64x2      %%S3, %%D15, %%D03, 0xee
        vshufi64x2      %%D03, %%S0, %%S2, 0x88 ; L1
        vshufi64x2      %%D07, %%S0, %%S2, 0xdd ; L3
        vshufi64x2      %%D11, %%S1, %%S3, 0x88 ; L5
        vshufi64x2      %%D15, %%S1, %%S3, 0xdd ; L7
%endmacro
724
;;; ===========================================================================
;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT
;;; ===========================================================================
;;;
;;; Reduced version of TRANSPOSE_IN that only produces the first two output
;;; vectors (word0 and word1 of every lane). Only the low dword pair of each
;;; input register contributes to the result, so just the "low" dword
;;; interleaves are computed.
;;;
;;; IN00-IN15 / R0/L0-R7/L7 [in/out]:
;;;   in:  IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
;;;   out: R0 - 16 x word0, L0 - 16 x word1
;;;        IN02, IN04, IN06, IN10, IN12 and IN13 are used as scratch space
;;;        and hold intermediate values on exit
;;; T0,T2       [clobbered] - temporary zmm registers
;;; K0,K1,K2,K4 [clobbered] - temporary zmm registers
;;; H0,H2       [clobbered] - temporary zmm registers
%macro TRANSPOSE_IN_ONE 24
%define %%IN00          %1      ; R0
%define %%IN01          %2      ; L0
%define %%IN02          %3      ; R1
%define %%IN03          %4      ; L1
%define %%IN04          %5      ; R2
%define %%IN05          %6      ; L2
%define %%IN06          %7      ; R3
%define %%IN07          %8      ; L3
%define %%IN08          %9      ; R4
%define %%IN09          %10     ; L4
%define %%IN10          %11     ; R5
%define %%IN11          %12     ; L5
%define %%IN12          %13     ; R6
%define %%IN13          %14     ; L6
%define %%IN14          %15     ; R7
%define %%IN15          %16     ; L7
%define %%T0            %17
%define %%T2            %18
%define %%K0            %19
%define %%K1            %20
%define %%K2            %21
%define %%K4            %22
%define %%H0            %23
%define %%H2            %24

        ;; lanes 0-3: dword interleave
        ;; (the "high" interleave of IN00/IN01 is not needed: its result was
        ;;  overwritten below before ever being read)
        vpunpckldq      %%K0, %%IN00, %%IN01
        vpunpckldq      %%T0, %%IN02, %%IN03

        ;; lanes 4-7: dword interleave
        ;; (likewise, no "high" interleave of IN04/IN05 is required)
        vpunpckldq      %%IN00, %%IN04, %%IN05
        vpunpckldq      %%IN02, %%IN06, %%IN07

        ;; lanes 0-3: qword interleave -> word0 (K2) and word1 (T2)
        vpunpcklqdq     %%K2, %%K0, %%T0
        vpunpckhqdq     %%T2, %%K0, %%T0

        ;; lanes 4-7: qword interleave -> word0 (K0) and word1 (K1)
        vpunpcklqdq     %%K0, %%IN00, %%IN02
        vpunpckhqdq     %%K1, %%IN00, %%IN02

        ;; lanes 8-15: dword interleave
        vpunpckldq      %%K4, %%IN08, %%IN09
        vpunpckldq      %%IN04, %%IN10, %%IN11
        vpunpckldq      %%IN06, %%IN12, %%IN13
        vpunpckldq      %%IN10, %%IN14, %%IN15

        ;; lanes 8-15: qword interleave
        vpunpcklqdq     %%IN12, %%K4, %%IN04
        vpunpckhqdq     %%IN13, %%K4, %%IN04
        vpunpcklqdq     %%IN00, %%IN06, %%IN10
        vpunpckhqdq     %%IN01, %%IN06, %%IN10

        ;; gather word0 of all 16 lanes
        vshufi64x2      %%H0, %%K2, %%K0, 0x44
        vshufi64x2      %%H2, %%IN12, %%IN00, 0x44
        vshufi64x2      %%IN00, %%H0, %%H2, 0x88        ; R0

        ;; gather word1 of all 16 lanes
        vshufi64x2      %%H0, %%T2, %%K1, 0x44
        vshufi64x2      %%H2, %%IN13, %%IN01, 0x44
        vshufi64x2      %%IN01, %%H0, %%H2, 0x88        ; L0
%endmacro
793
;;; ===========================================================================
;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT
;;; ===========================================================================
;;;
;;; Spreads the two input vectors back over the 16 lane registers.
;;; A zeroed register is used as the second source of every interleave and
;;; shuffle, so all output data beyond the first qword of each register is
;;; zero.
;;;
;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
;;;   in:  R0 - 16 x word0, L0 - 16 x word1
;;;   out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
;;; T0,T2,T3 [clobbered] - temporary zmm registers (T0 is left zeroed)
;;; K0-K3    [clobbered] - temporary zmm registers
;;; H0,H1    [clobbered] - temporary zmm registers
%macro TRANSPOSE_OUT_ONE 25
%define %%D00           %1      ; R0
%define %%D01           %2      ; L0
%define %%D02           %3      ; R1
%define %%D03           %4      ; L1
%define %%D04           %5      ; R2
%define %%D05           %6      ; L2
%define %%D06           %7      ; R3
%define %%D07           %8      ; L3
%define %%D08           %9      ; R4
%define %%D09           %10     ; L4
%define %%D10           %11     ; R5
%define %%D11           %12     ; L5
%define %%D12           %13     ; R6
%define %%D13           %14     ; L6
%define %%D14           %15     ; R7
%define %%D15           %16     ; L7
%define %%ZERO          %17
%define %%U2            %18
%define %%U3            %19
%define %%V0            %20
%define %%V1            %21
%define %%V2            %22
%define %%V3            %23
%define %%S0            %24
%define %%S1            %25

        vpxord          %%ZERO, %%ZERO, %%ZERO

        ;; pair up word1/word0 of every lane
        vpunpckldq      %%V0, %%D01, %%D00
        vpunpckhdq      %%V1, %%D01, %%D00

        ;; one lane per 128 bits, upper qword zero
        vpunpcklqdq     %%V2, %%V0, %%ZERO
        vpunpckhqdq     %%U2, %%V0, %%ZERO
        vpunpcklqdq     %%V3, %%V1, %%ZERO
        vpunpckhqdq     %%U3, %%V1, %%ZERO

        ;; registers 0, 4, 8, 12
        vshufi64x2      %%S0, %%V2, %%ZERO, 0x44
        vshufi64x2      %%S1, %%V2, %%ZERO, 0xee
        vshufi64x2      %%D00, %%S0, %%ZERO, 0x88       ; R0
        vshufi64x2      %%D04, %%S0, %%ZERO, 0xdd       ; R2
        vshufi64x2      %%D08, %%S1, %%ZERO, 0x88       ; R4
        vshufi64x2      %%D12, %%S1, %%ZERO, 0xdd       ; R6

        ;; registers 1, 5, 9, 13
        vshufi64x2      %%S0, %%U2, %%ZERO, 0x44
        vshufi64x2      %%S1, %%U2, %%ZERO, 0xee
        vshufi64x2      %%D01, %%S0, %%ZERO, 0x88       ; L0
        vshufi64x2      %%D05, %%S0, %%ZERO, 0xdd       ; L2
        vshufi64x2      %%D09, %%S1, %%ZERO, 0x88       ; L4
        vshufi64x2      %%D13, %%S1, %%ZERO, 0xdd       ; L6

        ;; registers 2, 6, 10, 14
        vshufi64x2      %%S0, %%V3, %%ZERO, 0x44
        vshufi64x2      %%S1, %%V3, %%ZERO, 0xee
        vshufi64x2      %%D02, %%S0, %%ZERO, 0x88       ; R1
        vshufi64x2      %%D06, %%S0, %%ZERO, 0xdd       ; R3
        vshufi64x2      %%D10, %%S1, %%ZERO, 0x88       ; R5
        vshufi64x2      %%D14, %%S1, %%ZERO, 0xdd       ; R7

        ;; registers 3, 7, 11, 15
        vshufi64x2      %%S0, %%U3, %%ZERO, 0x44
        vshufi64x2      %%S1, %%U3, %%ZERO, 0xee
        vshufi64x2      %%D03, %%S0, %%ZERO, 0x88       ; L1
        vshufi64x2      %%D07, %%S0, %%ZERO, 0xdd       ; L3
        vshufi64x2      %%D11, %%S1, %%ZERO, 0x88       ; L5
        vshufi64x2      %%D15, %%S1, %%ZERO, 0xdd       ; L7
%endmacro
869
;;; ===========================================================================
;;; DES INITIALIZATION
;;; key schedule transposition and IV set up
;;; ===========================================================================
;;;
;;; Reads 16 key schedule pointers from STATE_KEYS, loads 128 bytes through
;;; each of them (in two 64-byte halves), transposes every half with
;;; TRANSPOSE_IN and stores the result at KS (2 x 16 x 64 bytes in total).
;;; Then loads the two IV vectors from STATE_IV.
;;;
;;; Clobbers the IA0 general purpose register and (re)assigns the IDX
;;; preprocessor symbol.
;;;
;;; STATE_KEYS [in]        - KEYS in DES OOO STATE
;;; STATE_IV   [in]        - IV in DES OOO STATE
;;; KS         [out]       - place to store transposed key schedule
;;; IV0        [out]       - r512; initialization vector
;;; IV1        [out]       - r512; initialization vector
;;; T0-T27     [clobbered] - temporary r512
%macro DES_INIT 33
%define %%KEY_PTRS      %1
%define %%IV_PTR        %2
%define %%SCHED         %3
%define %%IV0           %4
%define %%IV1           %5
%define %%Z0            %6
%define %%Z1            %7
%define %%Z2            %8
%define %%Z3            %9
%define %%Z4            %10
%define %%Z5            %11
%define %%Z6            %12
%define %%Z7            %13
%define %%Z8            %14
%define %%Z9            %15
%define %%Z10           %16
%define %%Z11           %17
%define %%Z12           %18
%define %%Z13           %19
%define %%Z14           %20
%define %%Z15           %21
%define %%Z16           %22
%define %%Z17           %23
%define %%Z18           %24
%define %%Z19           %25
%define %%Z20           %26
%define %%Z21           %27
%define %%Z22           %28
%define %%Z23           %29
%define %%Z24           %30
%define %%Z25           %31
%define %%Z26           %32
%define %%Z27           %33

        ;; --- key schedule, first 64 bytes of every lane ---
        ;; load one lane per register, transpose, store
        ;; note: the IV registers serve as temporary ones here, they are
        ;;       only loaded at the very end
%assign IDX 0
%rep 16
        mov             IA0, [%%KEY_PTRS + (IDX*PTR_SZ)]
        vmovdqu64       %%Z %+ IDX, [IA0]
%assign IDX (IDX + 1)
%endrep
        TRANSPOSE_IN    %%Z0, %%Z1, %%Z2, %%Z3, %%Z4, %%Z5, %%Z6, %%Z7, %%Z8, %%Z9, %%Z10, %%Z11, %%Z12, %%Z13, %%Z14, %%Z15, %%Z16, %%Z17, %%Z18, %%Z19, %%Z20, %%Z21, %%Z22, %%Z23, %%Z24, %%Z25, %%Z26, %%Z27, %%IV0, %%IV1
%assign IDX 0
%rep 16
        vmovdqu64       [%%SCHED + (IDX * 64)], %%Z %+ IDX
%assign IDX (IDX + 1)
%endrep

        ;; --- key schedule, second 64 bytes of every lane ---
        ;; load one lane per register, transpose, store after the first half
        ;; note: the IV registers serve as temporary ones here as well
%assign IDX 0
%rep 16
        mov             IA0, [%%KEY_PTRS + (IDX*PTR_SZ)]
        vmovdqu64       %%Z %+ IDX, [IA0 + 64]
%assign IDX (IDX + 1)
%endrep
        TRANSPOSE_IN    %%Z0, %%Z1, %%Z2, %%Z3, %%Z4, %%Z5, %%Z6, %%Z7, %%Z8, %%Z9, %%Z10, %%Z11, %%Z12, %%Z13, %%Z14, %%Z15, %%Z16, %%Z17, %%Z18, %%Z19, %%Z20, %%Z21, %%Z22, %%Z23, %%Z24, %%Z25, %%Z26, %%Z27, %%IV0, %%IV1
%assign IDX 0
%rep 16
        vmovdqu64       [%%SCHED + (16 * 64) + (IDX * 64)], %%Z %+ IDX
%assign IDX (IDX + 1)
%endrep

        ;; --- IV ---
        ;; kept transposed in the state already, a plain load is sufficient
        vmovdqu64       %%IV0, [%%IV_PTR + (0 * 64)]
        vmovdqu64       %%IV1, [%%IV_PTR + (1 * 64)]
%endmacro
953
;;; ===========================================================================
;;; 3DES INITIALIZATION
;;; Transposes the three per-lane key schedules and loads the IV
;;; ===========================================================================
;;;
;;; STATE_KEYS [in]  - address of the 16 per-lane key entries in 3DES OOO STATE;
;;;                    every entry points to a table of 3 key schedule pointers
;;; STATE_IV   [in]  - address of the IVs in 3DES OOO STATE
;;; KS1        [out] - destination of the transposed schedule used by pass 1
;;; KS2        [out] - destination of the transposed schedule used by pass 2
;;; KS3        [out] - destination of the transposed schedule used by pass 3
;;;                    (2 x 16 x 64 bytes are written to each of KS1..KS3)
;;; IV0        [out] - r512; initialization vector
;;; IV1        [out] - r512; initialization vector
;;; T0-T27     [clobbered] - temporary r512
;;; DIR        [in]  - ENC: KS1..KS3 are built from key tables 0, 1, 2
;;;                    DEC: KS1..KS3 are built from key tables 2, 1, 0
;;;
;;; Uses IA0 as a scratch pointer.
;;; Modifies the preprocessor variables IDX, KEY_IDX, KS_IDX and KS_HALF.
%macro DES3_INIT 36
%define %%STATE_KEYS    %1
%define %%STATE_IV      %2
%define %%KS1           %3
%define %%KS2           %4
%define %%KS3           %5
%define %%IV0           %6
%define %%IV1           %7
%define %%T0            %8
%define %%T1            %9
%define %%T2            %10
%define %%T3            %11
%define %%T4            %12
%define %%T5            %13
%define %%T6            %14
%define %%T7            %15
%define %%T8            %16
%define %%T9            %17
%define %%T10           %18
%define %%T11           %19
%define %%T12           %20
%define %%T13           %21
%define %%T14           %22
%define %%T15           %23
%define %%T16           %24
%define %%T17           %25
%define %%T18           %26
%define %%T19           %27
%define %%T20           %28
%define %%T21           %29
%define %%T22           %30
%define %%T23           %31
%define %%T24           %32
%define %%T25           %33
%define %%T26           %34
%define %%T27           %35
%define %%DIR           %36

        ;; pick the key table that feeds the first pass
%ifidn %%DIR, ENC
%assign KEY_IDX 0
%else
%assign KEY_IDX 2
%endif
%assign KS_IDX  1

%rep 3                          ; one iteration per DES pass (KS1, KS2, KS3)

        ;; Every schedule is handled as two halves of 64 bytes per lane:
        ;;   - gather one 64 byte half from each of the 16 lanes
        ;;   - transpose
        ;;   - store 16 x 64 bytes into the current KS buffer
        ;; note: the IV registers are not live yet so they serve as
        ;;       temporary registers for the transposition
%assign KS_HALF 0
%rep 2

%assign IDX 0
%rep 16
        mov             IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
        mov             IA0, [IA0 + (KEY_IDX * PTR_SZ)]
        vmovdqu64       %%T %+ IDX, [IA0 + (KS_HALF * 64)]
%assign IDX (IDX + 1)
%endrep
        TRANSPOSE_IN    %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
%assign IDX 0
%rep 16
        vmovdqu64       [%%KS %+ KS_IDX + (KS_HALF * 16 * 64) + (IDX * 64)], %%T %+ IDX
%assign IDX (IDX + 1)
%endrep

%assign KS_HALF (KS_HALF + 1)
%endrep                         ; KS_HALF

        ;; step to the key table of the next pass
%ifidn %%DIR, ENC
%assign KEY_IDX (KEY_IDX + 1)
%else
%assign KEY_IDX (KEY_IDX - 1)
%endif
%assign KS_IDX (KS_IDX + 1)
%endrep                         ; KEY_IDX / KS_IDX

        ;; set up IV
        ;; - IVs are kept transposed in the state so plain loads are enough
        vmovdqu64       %%IV0, [%%STATE_IV + (0 * 64)]
        vmovdqu64       %%IV1, [%%STATE_IV + (1 * 64)]

%endmacro
1063
;;; ===========================================================================
;;; DES FINISH
;;; Advance all in/out pointers by SIZE and write the IV back to the state
;;; ===========================================================================
;;;
;;; Needs: STATE & SIZE
;;; IV0   [in] - r512; initialization vector
;;; IV1   [in] - r512; initialization vector
;;; T0-T4 [clobbered] - temporary r512 registers
%macro DES_FINISH 7
%define %%IV0           %1
%define %%IV1           %2
%define %%IN_LO         %3      ; input pointers, lanes 0-7
%define %%IN_HI         %4      ; input pointers, lanes 8-15
%define %%OUT_LO        %5      ; output pointers, lanes 0-7
%define %%OUT_HI        %6      ; output pointers, lanes 8-15
%define %%STEP          %7      ; SIZE replicated into every qword

        ;; fetch the 16 input and the 16 output pointers (8 per register)
        vmovdqu64       %%IN_LO,  [STATE + _des_args_in + (0 * PTR_SZ)]
        vmovdqu64       %%IN_HI,  [STATE + _des_args_in + (8 * PTR_SZ)]
        vmovdqu64       %%OUT_LO, [STATE + _des_args_out + (0 * PTR_SZ)]
        vmovdqu64       %%OUT_HI, [STATE + _des_args_out + (8 * PTR_SZ)]

        ;; every pointer moves forward by the same number of bytes
        vpbroadcastq    %%STEP, SIZE
        vpaddq          %%IN_LO,  %%IN_LO,  %%STEP
        vpaddq          %%IN_HI,  %%IN_HI,  %%STEP
        vpaddq          %%OUT_LO, %%OUT_LO, %%STEP
        vpaddq          %%OUT_HI, %%OUT_HI, %%STEP

        ;; write the updated pointers back
        vmovdqu64       [STATE + _des_args_in + (0 * PTR_SZ)], %%IN_LO
        vmovdqu64       [STATE + _des_args_in + (8 * PTR_SZ)], %%IN_HI
        vmovdqu64       [STATE + _des_args_out + (0 * PTR_SZ)], %%OUT_LO
        vmovdqu64       [STATE + _des_args_out + (8 * PTR_SZ)], %%OUT_HI

        ;; keep the current IV in the state
        vmovdqu64       [STATE + _des_args_IV + (0 * 64)], %%IV0
        vmovdqu64       [STATE + _des_args_IV + (1 * 64)], %%IV1
%endmacro
1099
;;; ===========================================================================
;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY
;;; ===========================================================================
;;;
;;; Processes the trailing partial block (1 to 7 bytes, taken from
;;; _des_args_PLen) of every lane that has one, then clears the partial
;;; length of the lanes it handled.
;;;
;;; Needs: STATE, IA0-IA2
;;; ENC_DEC [in] - encrypt (ENC) or decrypt (DEC) selection
;;; KS      [in] - key schedule
;;; T0-T24  [clobbered] - temporary r512
;;; T_IN    [in] - 16 * 8 byte storage (scratch copy of input pointers)
;;; T_OUT   [in] - 16 * 8 byte storage (scratch copy of output pointers)
;;; T_IV    [in] - 16 * 8 byte storage (scratch copy of the IVs)
;;; T_MASK  [in] - 16 * 4 byte storage (per lane byte mask)
;;;
;;; NOTE: clobbers OpMask registers
;;; NOTE: modifies the preprocessor variables IDX and K_IDX
%macro DES_CFB_ONE 31
%define %%ENC_DEC       %1
%define %%KS            %2
%define %%T0            %3
%define %%T1            %4
%define %%T2            %5
%define %%T3            %6
%define %%T4            %7
%define %%T5            %8
%define %%T6            %9
%define %%T7            %10
%define %%T8            %11
%define %%T9            %12
%define %%T10           %13
%define %%T11           %14
%define %%T12           %15
%define %%T13           %16
%define %%T14           %17
%define %%T15           %18
%define %%T16           %19
%define %%T17           %20
%define %%T18           %21
%define %%T19           %22
%define %%T20           %23
%define %%T21           %24
%define %%T22           %25
%define %%T23           %26
%define %%T24           %27
%define %%T_IN          %28
%define %%T_OUT         %29
%define %%T_IV          %30
%define %%T_MASK        %31

        ;; - find mask for non-zero partial lengths
        ;; - %%T10 stays all-zero until the data loads further down
        vpxord          %%T10, %%T10, %%T10
        vmovdqu64       %%T0, [STATE + _des_args_PLen]
        vpcmpd          k3, %%T0, %%T10, 4 ; NEQ
        kmovw           DWORD(IA0), k3
        movzx           DWORD(IA0), WORD(IA0)
        test            DWORD(IA0), DWORD(IA0)
        jz              %%_des_cfb_one_end ; no non-zero partial lengths

%ifidn %%ENC_DEC, ENC
        ;; For the encryption case we need to make sure that
        ;; all full blocks are complete before proceeding
        ;; with the CFB partial block.
        ;; To do that the current out position is compared against
        ;; the calculated last full block position.
        vmovdqu64       %%T1, [STATE + _des_args_out + (0*8)]
        vmovdqu64       %%T2, [STATE + _des_args_LOut + (0*8)]
        vmovdqu64       %%T3, [STATE + _des_args_out + (8*8)]
        vmovdqu64       %%T4, [STATE + _des_args_LOut + (8*8)]
        vpcmpq          k4, %%T1, %%T2, 0 ; EQ
        vpcmpq          k5, %%T3, %%T4, 0 ; EQ
        ;; merge the two 8-bit compare results into one 16-bit lane mask
        kmovw           DWORD(IA1), k4
        movzx           DWORD(IA1), BYTE(IA1)
        kmovw           DWORD(IA2), k5
        movzx           DWORD(IA2), BYTE(IA2)
        shl             DWORD(IA2), 8
        or              DWORD(IA2), DWORD(IA1)
        and             DWORD(IA0), DWORD(IA2)
        jz              %%_des_cfb_one_end ; no non-zero lengths left
        kmovw           k3, DWORD(IA0)
%endif
        ;; Calculate ((1 << partial_bytes) - 1)
        ;; in order to get the mask for loads and stores
        ;; k3 & IA0 - hold valid mask
        vmovdqa64       %%T1, [rel vec_ones_32b]
        vpsllvd         %%T2{k3}{z}, %%T1, %%T0
        vpsubd          %%T2{k3}{z}, %%T2, %%T1
        vmovdqu64       [%%T_MASK], %%T2

        ;; clear selected partial lens not to do them twice
        vmovdqu32       [STATE + _des_args_PLen]{k3}, %%T10

        ;; copy IV, in and out pointers
        vmovdqu64       %%T1, [STATE + _des_args_in + (0*PTR_SZ)]
        vmovdqu64       %%T2, [STATE + _des_args_in + (8*PTR_SZ)]
        vmovdqu64       %%T3, [STATE + _des_args_out + (0*PTR_SZ)]
        vmovdqu64       %%T4, [STATE + _des_args_out + (8*PTR_SZ)]
        vmovdqu64       %%T5, [STATE + _des_args_IV + (0*64)]
        vmovdqu64       %%T6, [STATE + _des_args_IV + (1*64)]
        vmovdqu64       [%%T_IN + (0*PTR_SZ)], %%T1
        vmovdqu64       [%%T_IN + (8*PTR_SZ)], %%T2
        vmovdqu64       [%%T_OUT + (0*PTR_SZ)], %%T3
        vmovdqu64       [%%T_OUT + (8*PTR_SZ)], %%T4
        vmovdqu64       [%%T_IV + (0*64)], %%T5
        vmovdqu64       [%%T_IV + (1*64)], %%T6

        ;; calculate last block case mask
        ;; - first block case requires no modifications to in/out/IV
        vmovdqu64       %%T1, [STATE + _des_args_BLen]
        vpcmpd          k2, %%T1, %%T10, 4 ; NEQ
        kmovw           DWORD(IA1), k2
        and             DWORD(IA1), DWORD(IA0)
        jz              %%_des_cfb_one_no_last_blocks

        ;; set up IV, in and out for the last block case
        ;; - IA1 holds the last block mask
        ;; - keep a copy of it in IA0 for the IV loop below; this is done
        ;;   for both directions so that lanes without any full block
        ;;   (BLen == 0) never have their IV replaced by the 8 bytes
        ;;   located in front of their buffer
        mov             DWORD(IA0), DWORD(IA1)
%ifidn %%ENC_DEC, DEC
        ;; - Last block needs in and out to be set differently
        ;;   (decryption only)
        ;; - split the 16-bit mask into k4 (lanes 0-7) and k5 (lanes 8-15)
        mov             DWORD(IA2), DWORD(IA1)
        shr             DWORD(IA1), 8
        and             DWORD(IA2), 0xff
        kmovw           k4, DWORD(IA2)
        kmovw           k5, DWORD(IA1)
        vmovdqu64       %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)]
        vmovdqu64       %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)]
        vmovdqu64       %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)]
        vmovdqu64       %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)]
        vmovdqu64       [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1
        vmovdqu64       [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2
        vmovdqu64       [%%T_IN + (0*PTR_SZ)]{k4}, %%T3
        vmovdqu64       [%%T_IN + (8*PTR_SZ)]{k5}, %%T4
%endif                          ; decryption
        ;; - IV has to be set differently for CFB as well
        ;; - IA0 holds the last block mask
        ;; - the IV of a selected lane is the 8 bytes preceding LOut (ENC)
        ;;   or LIn (DEC); its low dword goes to the first 64 bytes of T_IV
        ;;   and its high dword to the second 64 bytes
%assign IDX 0
%rep 16
        test            DWORD(IA0), (1 << IDX)
        jz              %%_des_cfb_one_copy_iv_next %+ IDX
%ifidn %%ENC_DEC, ENC
        mov             IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)]
%else
        mov             IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)]
%endif
        mov             IA2, [IA2 - 8]
        mov             [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2)
        shr             IA2, 32
        mov             [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2)
%%_des_cfb_one_copy_iv_next %+ IDX:
%assign IDX (IDX + 1)
%endrep

%%_des_cfb_one_no_last_blocks:
        ;; Finally do the DES CFB on T_IN, T_OUT, T_IV and T_MASK

        ;; - load data with the corresponding byte masks
        ;; - T0 to T15 will hold the data
        ;; - lanes with a zero mask load nothing and end up as all-zero
        xor             IA0, IA0
%assign IDX 0
%assign K_IDX 1
%rep 16
        mov             IA1, [%%T_IN + (IDX*PTR_SZ)]
        mov             DWORD(IA0), [%%T_MASK + (IDX*4)]
        kmovq           k %+ K_IDX, IA0
        vmovdqu8        %%T %+ IDX{k %+ K_IDX}{z}, [IA1]
%assign IDX (IDX + 1)
%assign K_IDX (K_IDX + 1)
%if K_IDX > 7
%assign K_IDX 1 ; iterate through K1 to K7
%endif
%endrep
        ;; - transpose the data in T0 to T15, T16 to T23 are clobbered
        TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23

        ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1
        vmovdqu64       %%T16, [%%T_IV + (0 * 64)] ;IV0
        vmovdqu64       %%T17, [%%T_IV + (1 * 64)] ;IV1
        ;; DES encrypt the IV in %%T16/%%T17 (ENC is used for both CFB
        ;; directions); %%T2 to %%T13 are passed as temporary registers
        ;; - data R0 - %%T0 (not touched by the cipher)
        ;; - data L0 - %%T1 (not touched by the cipher)
        DES_ENC_DEC     ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13
        ;; CFB style xor of R0/L0 with the processed IV
        ;; - IV0 - %%T16
        ;; - IV1 - %%T17
        vpxord          %%T2, %%T17, %%T0 ; R0 ^ IV1
        vpxord          %%T0, %%T16, %%T1 ; L0 ^ IV0
        vmovdqa64       %%T1, %%T2
        ;; - new R0 = L0 ^ IV0 (%%T0)
        ;; - new L0 = R0 ^ IV1 (%%T1)

        ;; Transpose the data out
        ;; - %%T2 to %%T24 clobbered
        TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24

        ;; Store the transposed data
        ;; - T0 to T15 hold the data
        ;; - the same per lane byte masks as for the loads are used
        xor             IA0, IA0
%assign IDX 0
%assign K_IDX 1
%rep 16
        mov             IA1, [%%T_OUT + (IDX*PTR_SZ)]
        mov             DWORD(IA0), [%%T_MASK + (IDX*4)]
        kmovq           k %+ K_IDX, IA0
        vmovdqu8        [IA1]{k %+ K_IDX}, %%T %+ IDX
%assign IDX (IDX + 1)
%assign K_IDX (K_IDX + 1)
%if K_IDX > 7
%assign K_IDX 1 ; iterate through K1 to K7
%endif
%endrep

%ifdef SAFE_DATA
        ;; Clear copied IV's
        vpxorq          %%T5, %%T5, %%T5
        vmovdqu64       [%%T_IV + (0*64)], %%T5
        vmovdqu64       [%%T_IV + (1*64)], %%T5
%endif

%%_des_cfb_one_end:

%endmacro
1319
;;; ===========================================================================
;;; Converts remaining length into a bit mask of whole DES blocks
;;; ===========================================================================
;;;
;;; Computes MASK = (1 << ((SIZE - OFFSET) / 8)) - 1
;;;
;;; MASK [out] - mask8 for value; for masked 64b loads and stores (r64)
;;;              (only the 32-bit part of the register is written)
;;; NEEDS: SIZE, OFFSET
;;; USES: rcx as the shift counter; unless IA1 is rcx itself, IA1 is
;;;       overwritten in order to preserve the caller's rcx
;;; ASSUMES: SIZE - OFFSET < 64
%macro GET_MASK8 1
%define %%MASK %1

        ;; a variable shift count must be in cl - park rcx in IA1 when
        ;; these are two different registers
%ifnidn IA1, rcx
        mov             IA1, rcx
%endif
        ;; rcx = number of whole 8 byte DES blocks left
        mov             rcx, SIZE
        sub             rcx, OFFSET
        shr             ecx, 3
        ;; set one bit for each of these blocks
        mov             DWORD(%%MASK), 1
        shl             DWORD(%%MASK), cl
        sub             DWORD(%%MASK), 1
%ifnidn IA1, rcx
        mov             rcx, IA1
%endif
%endmacro
1349
;;; ===========================================================================
;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
;;; ===========================================================================
;;;
;;; Block N lives in the register pair ZW(2N) / ZW(2N + 1).
;;; The first block has to be xor'ed with the IV by the caller. The macro
;;; chains every encrypted block into the next one and leaves the last
;;; encrypted block in ZIV0 / ZIV1.
;;;
;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
;;; DES_KS         [in] - pointer to transposed key schedule
;;;
;;; NOTE: clobbers OpMask registers
;;; NOTE: modifies the preprocessor variables RN, LN, RNN and LNN
;;; REQUIRES: ZTMP0 - ZTMP11, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
%macro GEN_DES_ENC_CIPHER 2
%define %%NBLOCKS       %1
%define %%KEYS          %2

        ;; RN / LN   - register indexes of the block being encrypted
        ;; RNN / LNN - register indexes of the block that follows it
%assign RN      0
%assign LN      1
%assign RNN     2
%assign LNN     3

        ;; all blocks but the last: encrypt & chain into the next block
%rep (%%NBLOCKS - 1)
        DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%KEYS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        vpxord          ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; next R ^= current L
        vpxord          ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; next L ^= current R
%assign RN      (RN + 2)
%assign LN      (LN + 2)
%assign RNN     (RNN + 2)
%assign LNN     (LNN + 2)
%endrep

        ;; last block: encrypt & keep the result as the new IV
        DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%KEYS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        vmovdqa64       ZIV0, ZW %+ LN          ; IV0 = L of the last block
        vmovdqa64       ZIV1, ZW %+ RN          ; IV1 = R of the last block
%endmacro
1380
;;; ===========================================================================
;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
;;; ===========================================================================
;;;
;;; Block N lives in the register pair ZW(2N) / ZW(2N + 1).
;;; On entry ZIV0 / ZIV1 hold the IV for the first block; on exit they hold
;;; a copy of the last input block, taken before it was decrypted.
;;;
;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
;;; DES_KS         [in] - pointer to transposed key schedule
;;;
;;; NOTE: clobbers OpMask registers
;;; NOTE: modifies the preprocessor variables RN and LN
;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
%macro GEN_DES_DEC_CIPHER 2
%define %%NBLOCKS       %1
%define %%KEYS          %2

%assign RN      0
%assign LN      1
%rep %%NBLOCKS
        ;; the input block is the IV of the block after it - save it
        ;; before it gets overwritten by the decryption
        vmovdqa64       ZTMP12, ZW %+ RN
        vmovdqa64       ZTMP13, ZW %+ LN
        DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%KEYS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        ;; CBC: xor the decrypted block with the current IV
        vpxord          ZW %+ RN, ZW %+ RN, ZIV1 ; R = R ^ IV1
        vpxord          ZW %+ LN, ZW %+ LN, ZIV0 ; L = L ^ IV0
        ;; the saved input block becomes the IV of the next block
        vmovdqa64       ZIV0, ZTMP12
        vmovdqa64       ZIV1, ZTMP13
%assign RN      (RN + 2)
%assign LN      (LN + 2)
%endrep
%endmacro
1408
;;; ===========================================================================
;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
;;; ===========================================================================
;;;
;;; Block N lives in the register pair ZW(2N) / ZW(2N + 1).
;;; Every block goes through encrypt(KS1), decrypt(KS2), encrypt(KS3).
;;; The first block has to be xor'ed with the IV by the caller. The macro
;;; chains every processed block into the next one and leaves the last
;;; processed block in ZIV0 / ZIV1.
;;;
;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
;;; DES_KS1        [in] - pointer to transposed key schedule 1
;;; DES_KS2        [in] - pointer to transposed key schedule 2
;;; DES_KS3        [in] - pointer to transposed key schedule 3
;;;
;;; NOTE: clobbers OpMask registers
;;; NOTE: modifies the preprocessor variables RN, LN, RNN and LNN
;;; REQUIRES: ZTMP0 - ZTMP11, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
%macro GEN_3DES_ENC_CIPHER 4
%define %%NBLOCKS       %1
%define %%KEYS1         %2
%define %%KEYS2         %3
%define %%KEYS3         %4

        ;; RN / LN   - register indexes of the block being processed
        ;; RNN / LNN - register indexes of the block that follows it
%assign RN      0
%assign LN      1
%assign RNN     2
%assign LNN     3
%rep %%NBLOCKS
        ;; E-D-E; note that the register pair is passed in swapped order
        ;; to the middle (decrypt) pass
        DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%KEYS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        DES_ENC_DEC     DEC, ZW %+ LN, ZW %+ RN, %%KEYS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%KEYS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
%if (RNN < (%%NBLOCKS * 2))
        ;; not the last block: chain the result into the next block
        vpxord          ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; next R ^= current L
        vpxord          ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; next L ^= current R
%else
        ;; last block: the result is the new IV
        vmovdqa64       ZIV0, ZW %+ LN          ; IV0 = L of the last block
        vmovdqa64       ZIV1, ZW %+ RN          ; IV1 = R of the last block
%endif

%assign RN      (RN + 2)
%assign LN      (LN + 2)
%assign RNN     (RNN + 2)
%assign LNN     (LNN + 2)
%endrep

%endmacro
1452
;;; ===========================================================================
;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
;;; ===========================================================================
;;;
;;; Block N lives in the register pair ZW(2N) / ZW(2N + 1).
;;; Every block goes through decrypt(KS1), encrypt(KS2), decrypt(KS3).
;;; On entry ZIV0 / ZIV1 hold the IV for the first block; on exit they hold
;;; a copy of the last input block, taken before it was decrypted.
;;;
;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
;;; DES_KS1        [in] - pointer to transposed key schedule 1
;;; DES_KS2        [in] - pointer to transposed key schedule 2
;;; DES_KS3        [in] - pointer to transposed key schedule 3
;;;
;;; NOTE: clobbers OpMask registers
;;; NOTE: modifies the preprocessor variables RN and LN
;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
%macro GEN_3DES_DEC_CIPHER 4
%define %%NBLOCKS       %1
%define %%KEYS1         %2
%define %%KEYS2         %3
%define %%KEYS3         %4

%assign RN      0
%assign LN      1
%rep %%NBLOCKS
        ;; the input block is the IV of the block after it - save it
        ;; before it gets overwritten by the decryption
        vmovdqa64       ZTMP12, ZW %+ RN
        vmovdqa64       ZTMP13, ZW %+ LN
        ;; D-E-D; note that the register pair is passed in swapped order
        ;; to the middle (encrypt) pass
        DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%KEYS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        DES_ENC_DEC     ENC, ZW %+ LN, ZW %+ RN, %%KEYS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%KEYS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
        ;; CBC: xor the decrypted block with the current IV
        vpxord          ZW %+ RN, ZW %+ RN, ZIV1 ; R = R ^ IV1
        vpxord          ZW %+ LN, ZW %+ LN, ZIV0 ; L = L ^ IV0
        ;; the saved input block becomes the IV of the next block
        vmovdqa64       ZIV0, ZTMP12
        vmovdqa64       ZIV1, ZTMP13

%assign RN      (RN + 2)
%assign LN      (LN + 2)
%endrep

%endmacro
1491
;;; ===========================================================================
;;; DES CBC / DOCSIS DES ENCRYPT
;;; ===========================================================================
;;;
;;; Function body shared by the three 16 lane encrypt flavours:
;;;   1. build an aligned stack frame, save r12-r15
;;;   2. transpose the key schedule(s) onto the stack and load the IV
;;;   3. encrypt SIZE bytes of every lane, 64 bytes (8 DES blocks) at a time
;;;   4. encrypt what is left (1 to 7 DES blocks) with masked loads/stores
;;;   5. store the IV and advance in/out pointers in the state
;;;   6. DOCSIS only: handle the partial (CFB) block
;;;   7. clear the key schedule(s), restore r12-r15 and rsp
;;;
;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
;;;                   3DES (3DES CBC)
;;;
;;; Needs: STATE, SIZE, OFFSET, IA0-IA2, INP0-INP4, ZW0-ZW15, ZTMP0-ZTMP13,
;;;        ZIV0, ZIV1
;;; NOTE: clobbers rax and OpMask registers
%macro GENERIC_DES_ENC 1
%define %%DES_DOCSIS %1

        ;; allocate a 64 byte aligned stack frame; the original stack
        ;; pointer and r12-r15 are kept inside of the frame
        mov             rax, rsp
        sub             rsp, STACKFRAME_size
        and             rsp, -64
        mov             [rsp + _rsp_save], rax  ; original SP
        mov             [rsp + _gpr_save + 0*8], r12
        mov             [rsp + _gpr_save + 1*8], r13
        mov             [rsp + _gpr_save + 2*8], r14
        mov             [rsp + _gpr_save + 3*8], r15

        ;; key schedule(s) & IV
%ifnidn %%DES_DOCSIS, 3DES
        ;; DES and DOCSIS DES
        DES_INIT        STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
%else
        ;; 3DES
        DES3_INIT       STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC
%endif

        ;; keep the full size for later and round SIZE down to a multiple
        ;; of 64 for the main loop
        mov             [rsp + _size_save], SIZE
        and             SIZE, -64
        xor             OFFSET, OFFSET

        ;; ---------------------------------------------------------------
        ;; Main loop: 64 bytes of every lane per iteration.
        ;; Whatever is left (less than 64 bytes) is processed after it.
        ;; ---------------------------------------------------------------
%%_enc_full_loop:
        cmp             OFFSET, SIZE
        je              %%_enc_full_done

        ;; load 64 bytes of input, lanes 0-7
        mov             IA0, [STATE + _des_args_in + (0*PTR_SZ)]
        mov             IA1, [STATE + _des_args_in + (1*PTR_SZ)]
        mov             IA2, [STATE + _des_args_in + (2*PTR_SZ)]
        mov             INP0, [STATE + _des_args_in + (3*PTR_SZ)]
        mov             INP1, [STATE + _des_args_in + (4*PTR_SZ)]
        mov             INP2, [STATE + _des_args_in + (5*PTR_SZ)]
        mov             INP3, [STATE + _des_args_in + (6*PTR_SZ)]
        mov             INP4, [STATE + _des_args_in + (7*PTR_SZ)]
        vmovdqu64       ZW0, [IA0 + OFFSET]
        vmovdqu64       ZW1, [IA1 + OFFSET]
        vmovdqu64       ZW2, [IA2 + OFFSET]
        vmovdqu64       ZW3, [INP0 + OFFSET]
        vmovdqu64       ZW4, [INP1 + OFFSET]
        vmovdqu64       ZW5, [INP2 + OFFSET]
        vmovdqu64       ZW6, [INP3 + OFFSET]
        vmovdqu64       ZW7, [INP4 + OFFSET]

        ;; load 64 bytes of input, lanes 8-15
        mov             IA0, [STATE + _des_args_in + (8*PTR_SZ)]
        mov             IA1, [STATE + _des_args_in + (9*PTR_SZ)]
        mov             IA2, [STATE + _des_args_in + (10*PTR_SZ)]
        mov             INP0, [STATE + _des_args_in + (11*PTR_SZ)]
        mov             INP1, [STATE + _des_args_in + (12*PTR_SZ)]
        mov             INP2, [STATE + _des_args_in + (13*PTR_SZ)]
        mov             INP3, [STATE + _des_args_in + (14*PTR_SZ)]
        mov             INP4, [STATE + _des_args_in + (15*PTR_SZ)]
        vmovdqu64       ZW8, [IA0 + OFFSET]
        vmovdqu64       ZW9, [IA1 + OFFSET]
        vmovdqu64       ZW10, [IA2 + OFFSET]
        vmovdqu64       ZW11, [INP0 + OFFSET]
        vmovdqu64       ZW12, [INP1 + OFFSET]
        vmovdqu64       ZW13, [INP2 + OFFSET]
        vmovdqu64       ZW14, [INP3 + OFFSET]
        vmovdqu64       ZW15, [INP4 + OFFSET]

        ;; lane-major -> block-major
        TRANSPOSE_IN    ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13

        ;; CBC: first block is xor'ed with the IV, the cipher macro
        ;; chains the remaining ones
        vpxord          ZW0, ZW0, ZIV0  ; R0 = R0 ^ IV0
        vpxord          ZW1, ZW1, ZIV1  ; L0 = L0 ^ IV1

%ifnidn %%DES_DOCSIS, 3DES
        GEN_DES_ENC_CIPHER 8, rsp + _key_sched
%else
        GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
%endif

        ;; block-major -> lane-major
        TRANSPOSE_OUT   ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13

        ;; store 64 bytes of output, lanes 0-7
        mov             IA0, [STATE + _des_args_out + (0*PTR_SZ)]
        mov             IA1, [STATE + _des_args_out + (1*PTR_SZ)]
        mov             IA2, [STATE + _des_args_out + (2*PTR_SZ)]
        mov             INP0, [STATE + _des_args_out + (3*PTR_SZ)]
        mov             INP1, [STATE + _des_args_out + (4*PTR_SZ)]
        mov             INP2, [STATE + _des_args_out + (5*PTR_SZ)]
        mov             INP3, [STATE + _des_args_out + (6*PTR_SZ)]
        mov             INP4, [STATE + _des_args_out + (7*PTR_SZ)]
        vmovdqu64       [IA0 + OFFSET], ZW0
        vmovdqu64       [IA1 + OFFSET], ZW1
        vmovdqu64       [IA2 + OFFSET], ZW2
        vmovdqu64       [INP0 + OFFSET], ZW3
        vmovdqu64       [INP1 + OFFSET], ZW4
        vmovdqu64       [INP2 + OFFSET], ZW5
        vmovdqu64       [INP3 + OFFSET], ZW6
        vmovdqu64       [INP4 + OFFSET], ZW7

        ;; store 64 bytes of output, lanes 8-15
        mov             IA0, [STATE + _des_args_out + (8*PTR_SZ)]
        mov             IA1, [STATE + _des_args_out + (9*PTR_SZ)]
        mov             IA2, [STATE + _des_args_out + (10*PTR_SZ)]
        mov             INP0, [STATE + _des_args_out + (11*PTR_SZ)]
        mov             INP1, [STATE + _des_args_out + (12*PTR_SZ)]
        mov             INP2, [STATE + _des_args_out + (13*PTR_SZ)]
        mov             INP3, [STATE + _des_args_out + (14*PTR_SZ)]
        mov             INP4, [STATE + _des_args_out + (15*PTR_SZ)]
        vmovdqu64       [IA0 + OFFSET], ZW8
        vmovdqu64       [IA1 + OFFSET], ZW9
        vmovdqu64       [IA2 + OFFSET], ZW10
        vmovdqu64       [INP0 + OFFSET], ZW11
        vmovdqu64       [INP1 + OFFSET], ZW12
        vmovdqu64       [INP2 + OFFSET], ZW13
        vmovdqu64       [INP3 + OFFSET], ZW14
        vmovdqu64       [INP4 + OFFSET], ZW15

        add             OFFSET, 64
        jmp             %%_enc_full_loop

%%_enc_full_done:
        ;; ---------------------------------------------------------------
        ;; Tail: is there anything less than 64 bytes left to process?
        ;; ---------------------------------------------------------------
        mov             SIZE, [rsp + _size_save]
        cmp             OFFSET, SIZE
        je              %%_enc_tail_done

        ;; turn the number of bytes left into a qword mask (one bit per
        ;; DES block); k7 drives the masked loads and stores and a copy
        ;; on the stack is used to select the number of blocks to cipher
        GET_MASK8       IA0             ; IA0 = mask

        kmovw           k7, DWORD(IA0)
        mov             [rsp + _mask_save], IA0

        ;; masked load of the input, lanes 0-7
        mov             IA0, [STATE + _des_args_in + (0*PTR_SZ)]
        mov             IA1, [STATE + _des_args_in + (1*PTR_SZ)]
        mov             IA2, [STATE + _des_args_in + (2*PTR_SZ)]
        mov             INP0, [STATE + _des_args_in + (3*PTR_SZ)]
        mov             INP1, [STATE + _des_args_in + (4*PTR_SZ)]
        mov             INP2, [STATE + _des_args_in + (5*PTR_SZ)]
        mov             INP3, [STATE + _des_args_in + (6*PTR_SZ)]
        mov             INP4, [STATE + _des_args_in + (7*PTR_SZ)]
        vmovdqu64       ZW0{k7}{z}, [IA0 + OFFSET]
        vmovdqu64       ZW1{k7}{z}, [IA1 + OFFSET]
        vmovdqu64       ZW2{k7}{z}, [IA2 + OFFSET]
        vmovdqu64       ZW3{k7}{z}, [INP0 + OFFSET]
        vmovdqu64       ZW4{k7}{z}, [INP1 + OFFSET]
        vmovdqu64       ZW5{k7}{z}, [INP2 + OFFSET]
        vmovdqu64       ZW6{k7}{z}, [INP3 + OFFSET]
        vmovdqu64       ZW7{k7}{z}, [INP4 + OFFSET]

        ;; masked load of the input, lanes 8-15
        mov             IA0, [STATE + _des_args_in + (8*PTR_SZ)]
        mov             IA1, [STATE + _des_args_in + (9*PTR_SZ)]
        mov             IA2, [STATE + _des_args_in + (10*PTR_SZ)]
        mov             INP0, [STATE + _des_args_in + (11*PTR_SZ)]
        mov             INP1, [STATE + _des_args_in + (12*PTR_SZ)]
        mov             INP2, [STATE + _des_args_in + (13*PTR_SZ)]
        mov             INP3, [STATE + _des_args_in + (14*PTR_SZ)]
        mov             INP4, [STATE + _des_args_in + (15*PTR_SZ)]
        vmovdqu64       ZW8{k7}{z}, [IA0 + OFFSET]
        vmovdqu64       ZW9{k7}{z}, [IA1 + OFFSET]
        vmovdqu64       ZW10{k7}{z}, [IA2 + OFFSET]
        vmovdqu64       ZW11{k7}{z}, [INP0 + OFFSET]
        vmovdqu64       ZW12{k7}{z}, [INP1 + OFFSET]
        vmovdqu64       ZW13{k7}{z}, [INP2 + OFFSET]
        vmovdqu64       ZW14{k7}{z}, [INP3 + OFFSET]
        vmovdqu64       ZW15{k7}{z}, [INP4 + OFFSET]

        ;; lane-major -> block-major
        TRANSPOSE_IN    ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13

        ;; CBC: first block is xor'ed with the IV
        vpxord          ZW0, ZW0, ZIV0  ; R0 = R0 ^ IV0
        vpxord          ZW1, ZW1, ZIV1  ; L0 = L0 ^ IV1

        ;; Select the cipher variant by the number of blocks left.
        ;; The block mask is 2^N - 1 so the low mask byte is one of:
        ;;   0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f
        mov             IA0, [rsp + _mask_save]
        cmp             BYTE(IA0), 0x0f
        ja              %%_enc_tail_5_to_7
        je              %%_enc_tail_4

        cmp             BYTE(IA0), 0x03
        ja              %%_enc_tail_3
        je              %%_enc_tail_2

        ;; one block
%ifnidn %%DES_DOCSIS, 3DES
        GEN_DES_ENC_CIPHER 1, rsp + _key_sched
%else
        GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
%endif
        jmp             %%_enc_tail_store

%%_enc_tail_2:
        ;; two blocks
%ifnidn %%DES_DOCSIS, 3DES
        GEN_DES_ENC_CIPHER 2, rsp + _key_sched
%else
        GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
%endif
        jmp             %%_enc_tail_store

%%_enc_tail_3:
        ;; three blocks
%ifnidn %%DES_DOCSIS, 3DES
        GEN_DES_ENC_CIPHER 3, rsp + _key_sched
%else
        GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
%endif
        jmp             %%_enc_tail_store

%%_enc_tail_4:
        ;; four blocks
%ifnidn %%DES_DOCSIS, 3DES
        GEN_DES_ENC_CIPHER 4, rsp + _key_sched
%else
        GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
%endif
        jmp             %%_enc_tail_store

%%_enc_tail_5_to_7:
        cmp             BYTE(IA0), 0x3f
        ja              %%_enc_tail_7
        je              %%_enc_tail_6

        ;; five blocks
%ifnidn %%DES_DOCSIS, 3DES
        GEN_DES_ENC_CIPHER 5, rsp + _key_sched
%else
        GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
%endif
        jmp             %%_enc_tail_store

%%_enc_tail_6:
        ;; six blocks
%ifnidn %%DES_DOCSIS, 3DES
        GEN_DES_ENC_CIPHER 6, rsp + _key_sched
%else
        GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
%endif
        jmp             %%_enc_tail_store

%%_enc_tail_7:
        ;; seven blocks (falls through to the store)
%ifnidn %%DES_DOCSIS, 3DES
        GEN_DES_ENC_CIPHER 7, rsp + _key_sched
%else
        GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
%endif

%%_enc_tail_store:
        ;; block-major -> lane-major
        TRANSPOSE_OUT   ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13

        ;; masked store of the output, lanes 0-7
        mov             IA0, [STATE + _des_args_out + (0*PTR_SZ)]
        mov             IA1, [STATE + _des_args_out + (1*PTR_SZ)]
        mov             IA2, [STATE + _des_args_out + (2*PTR_SZ)]
        mov             INP0, [STATE + _des_args_out + (3*PTR_SZ)]
        mov             INP1, [STATE + _des_args_out + (4*PTR_SZ)]
        mov             INP2, [STATE + _des_args_out + (5*PTR_SZ)]
        mov             INP3, [STATE + _des_args_out + (6*PTR_SZ)]
        mov             INP4, [STATE + _des_args_out + (7*PTR_SZ)]
        vmovdqu64       [IA0 + OFFSET]{k7}, ZW0
        vmovdqu64       [IA1 + OFFSET]{k7}, ZW1
        vmovdqu64       [IA2 + OFFSET]{k7}, ZW2
        vmovdqu64       [INP0 + OFFSET]{k7}, ZW3
        vmovdqu64       [INP1 + OFFSET]{k7}, ZW4
        vmovdqu64       [INP2 + OFFSET]{k7}, ZW5
        vmovdqu64       [INP3 + OFFSET]{k7}, ZW6
        vmovdqu64       [INP4 + OFFSET]{k7}, ZW7

        ;; masked store of the output, lanes 8-15
        mov             IA0, [STATE + _des_args_out + (8*PTR_SZ)]
        mov             IA1, [STATE + _des_args_out + (9*PTR_SZ)]
        mov             IA2, [STATE + _des_args_out + (10*PTR_SZ)]
        mov             INP0, [STATE + _des_args_out + (11*PTR_SZ)]
        mov             INP1, [STATE + _des_args_out + (12*PTR_SZ)]
        mov             INP2, [STATE + _des_args_out + (13*PTR_SZ)]
        mov             INP3, [STATE + _des_args_out + (14*PTR_SZ)]
        mov             INP4, [STATE + _des_args_out + (15*PTR_SZ)]
        vmovdqu64       [IA0 + OFFSET]{k7}, ZW8
        vmovdqu64       [IA1 + OFFSET]{k7}, ZW9
        vmovdqu64       [IA2 + OFFSET]{k7}, ZW10
        vmovdqu64       [INP0 + OFFSET]{k7}, ZW11
        vmovdqu64       [INP1 + OFFSET]{k7}, ZW12
        vmovdqu64       [INP2 + OFFSET]{k7}, ZW13
        vmovdqu64       [INP3 + OFFSET]{k7}, ZW14
        vmovdqu64       [INP4 + OFFSET]{k7}, ZW15
%%_enc_tail_done:

        ;; store IV and update pointers (SIZE holds the full size again)
        DES_FINISH      ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4

        ;; CFB part for DOCSIS
%ifidn %%DES_DOCSIS, DOCSIS
        DES_CFB_ONE     ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
%endif

        CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0

        ;; restore registers and the original stack pointer
        mov             r12, [rsp + _gpr_save + 0*8]
        mov             r13, [rsp + _gpr_save + 1*8]
        mov             r14, [rsp + _gpr_save + 2*8]
        mov             r15, [rsp + _gpr_save + 3*8]
        mov             rsp, [rsp + _rsp_save]  ; original SP
%endmacro
1799
1800 ;;; ===========================================================================
1801 ;;; DES CBC / DOCSIS DES DECRYPT
1802 ;;; ===========================================================================
1803 ;;;
1804 ;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
1805 ;;; 3DES (3DES CBC)
1806 ;;;
1807 ;;; NOTE: clobbers OpMask registers
;;;
;;; Flow of the macro body:
;;;   1. Build a 64-byte aligned stack frame; the caller's RSP and r12-r15 are
;;;      stored in it and restored at the end.
;;;   2. Run DES_INIT (DES, DOCSIS) or DES3_INIT (3DES) on the key and IV
;;;      arrays found in STATE, with key schedule storage in the frame.
;;;   3. DOCSIS only: run DES_CFB_ONE DEC before the bulk loop.
;;;   4. Bulk loop: per iteration, 64 bytes are read from each of the 16 input
;;;      pointers in STATE, transposed, deciphered (8 blocks), transposed back
;;;      and written to the 16 output pointers.
;;;   5. Tail: whatever is left (less than 64 bytes) goes through k7-masked
;;;      loads/stores, with a cipher expansion picked per block count (1..7).
;;;   6. DES_FINISH, CLEAR_KEY_SCHEDULE, then the saved registers are restored.
;;;
;;; Besides the OpMask registers, rax is overwritten (it carries the caller's
;;; RSP during frame setup). The macro does not emit RET.
;;; STATE, SIZE, OFFSET, IA0-IA2, INP0-INP4, PTR_SZ and the ZW/ZTMP/ZIV names
;;; are defined outside this macro.
1808 %macro GENERIC_DES_DEC 1
1809 %define %%DES_DOCSIS %1
1810
1811 ;; push the registers and allocate the stack frame
;; rax holds the incoming RSP so that it can be written into the frame after
;; RSP has been lowered and rounded down to a 64-byte boundary
1812 mov rax, rsp
1813 sub rsp, STACKFRAME_size
1814 and rsp, -64
1815 mov [rsp + _rsp_save], rax ; original SP
1816 mov [rsp + _gpr_save + 0*8], r12
1817 mov [rsp + _gpr_save + 1*8], r13
1818 mov [rsp + _gpr_save + 2*8], r14
1819 mov [rsp + _gpr_save + 3*8], r15
1820
;; NOTE(review): DES_INIT / DES3_INIT are defined elsewhere; presumably they
;; fill the key schedule area(s) in the frame and load ZIV0/ZIV1 - confirm.
1821 %ifnidn %%DES_DOCSIS, 3DES
1822 ;; DES and DOCSIS
1823 DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
1824 %else
1825 ;; 3DES
1826 DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC
1827 %endif
1828
1829 ;; CFB part for DOCSIS
;; In the encrypt macro DES_CFB_ONE comes after DES_FINISH; here it runs ahead
;; of the bulk loop.
;; NOTE(review): presumably because the CFB step needs ciphertext that the
;; bulk loop may overwrite when decrypting in place - confirm in DES_CFB_ONE.
1830 %ifidn %%DES_DOCSIS, DOCSIS
1831 DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
1832 %endif
1833
;; The full byte count is parked in the frame; SIZE then becomes the count
;; rounded down to a multiple of 64 (the loop bound) and OFFSET the running
;; byte offset applied to all 16 buffers.
1834 mov [rsp + _size_save], SIZE
1835 and SIZE, -64
1836 xor OFFSET, OFFSET
1837 ;; This loop processes message in blocks of 64 bytes.
1838 ;; Anything smaller than 64 bytes is handled separately after the loop.
1839 %%_gen_des_dec_loop:
1840 cmp OFFSET, SIZE
1841 jz %%_gen_des_dec_loop_end
1842 ;; run loads
;; input pointers 0-7 -> ZW0-ZW7, 64 bytes each
1843 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1844 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1845 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1846 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1847 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1848 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1849 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1850 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1851 vmovdqu64 ZW0, [IA0 + OFFSET]
1852 vmovdqu64 ZW1, [IA1 + OFFSET]
1853 vmovdqu64 ZW2, [IA2 + OFFSET]
1854 vmovdqu64 ZW3, [INP0 + OFFSET]
1855 vmovdqu64 ZW4, [INP1 + OFFSET]
1856 vmovdqu64 ZW5, [INP2 + OFFSET]
1857 vmovdqu64 ZW6, [INP3 + OFFSET]
1858 vmovdqu64 ZW7, [INP4 + OFFSET]
1859
;; the same eight pointer registers are reused for input pointers 8-15
1860 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1861 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1862 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1863 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1864 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1865 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1866 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1867 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1868 vmovdqu64 ZW8, [IA0 + OFFSET]
1869 vmovdqu64 ZW9, [IA1 + OFFSET]
1870 vmovdqu64 ZW10, [IA2 + OFFSET]
1871 vmovdqu64 ZW11, [INP0 + OFFSET]
1872 vmovdqu64 ZW12, [INP1 + OFFSET]
1873 vmovdqu64 ZW13, [INP2 + OFFSET]
1874 vmovdqu64 ZW14, [INP3 + OFFSET]
1875 vmovdqu64 ZW15, [INP4 + OFFSET]
1876
1877 ;; Transpose input
1878 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1879
;; block count 8: 64 bytes per buffer divided by the 8-byte DES block
1880 %ifnidn %%DES_DOCSIS, 3DES
1881 ;; DES CBC DEC comes here
1882 GEN_DES_DEC_CIPHER 8, rsp + _key_sched
1883 %else
1884 ;; 3DES CBC DEC comes here
1885 GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1886 %endif
1887
1888 ;; transpose data on output
1889 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1890
1891 ;; run stores
;; ZW0-ZW7 -> output pointers 0-7, at the same OFFSET the data was read from
1892 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
1893 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
1894 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
1895 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
1896 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
1897 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
1898 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
1899 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
1900 vmovdqu64 [IA0 + OFFSET], ZW0
1901 vmovdqu64 [IA1 + OFFSET], ZW1
1902 vmovdqu64 [IA2 + OFFSET], ZW2
1903 vmovdqu64 [INP0 + OFFSET], ZW3
1904 vmovdqu64 [INP1 + OFFSET], ZW4
1905 vmovdqu64 [INP2 + OFFSET], ZW5
1906 vmovdqu64 [INP3 + OFFSET], ZW6
1907 vmovdqu64 [INP4 + OFFSET], ZW7
1908
;; ZW8-ZW15 -> output pointers 8-15
1909 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
1910 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
1911 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
1912 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
1913 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
1914 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
1915 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
1916 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
1917 vmovdqu64 [IA0 + OFFSET], ZW8
1918 vmovdqu64 [IA1 + OFFSET], ZW9
1919 vmovdqu64 [IA2 + OFFSET], ZW10
1920 vmovdqu64 [INP0 + OFFSET], ZW11
1921 vmovdqu64 [INP1 + OFFSET], ZW12
1922 vmovdqu64 [INP2 + OFFSET], ZW13
1923 vmovdqu64 [INP3 + OFFSET], ZW14
1924 vmovdqu64 [INP4 + OFFSET], ZW15
1925
1926 add OFFSET, 64
1927 jmp %%_gen_des_dec_loop
1928 %%_gen_des_dec_loop_end:
1929 ;; This is where we check if there is anything less than 64 bytes
1930 ;; of message left for processing.
;; SIZE is reloaded with the full byte count saved before the loop; equality
;; with OFFSET means the length was a multiple of 64 and there is no tail.
1931 mov SIZE, [rsp + _size_save]
1932 cmp OFFSET, SIZE
1933 jz %%_gen_des_dec_part_end
1934 ;; calculate min of bytes_left and 64, convert to qword mask
1935 GET_MASK8 IA0 ; IA0 = mask
1936
;; k7 drives all masked loads and stores of the tail. IA0 is reused as a
;; pointer register right below, so the mask is also parked in the frame for
;; the block-count dispatch further down.
1937 kmovw k7, DWORD(IA0)
1938 mov [rsp + _mask_save], IA0
1939 ;; run masked loads
;; {k7}{z}: qwords not selected by k7 are zeroed in the destination register
1940 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1941 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1942 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1943 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1944 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1945 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1946 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1947 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1948 vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
1949 vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
1950 vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
1951 vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
1952 vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
1953 vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
1954 vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
1955 vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
1956
1957 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1958 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1959 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1960 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1961 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1962 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1963 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1964 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1965 vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
1966 vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
1967 vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
1968 vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
1969 vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
1970 vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
1971 vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
1972 vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
1973
1974 ;; Transpose input
1975 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1976
1977 ;; DES CBC DEC comes here
;; Dispatch on the low byte of the saved qword mask, using unsigned compares:
;;   below 0x03 -> 1 block     equal 0x03 -> 2 blocks
;;   0x04..0x0e -> 3 blocks    equal 0x0f -> 4 blocks
;;   0x10..0x3e -> 5 blocks    equal 0x3f -> 6 blocks
;;   above 0x3f -> 7 blocks
;; With a contiguous low-bit mask these are 0x01, 0x03, 0x07, 0x0f, 0x1f,
;; 0x3f and 0x7f - TODO confirm that GET_MASK8 only produces such values.
1978 mov IA0, [rsp + _mask_save]
1979 cmp BYTE(IA0), 0x0f
1980 ja %%_gt_4
1981 jz %%_blocks_4
1982
1983 cmp BYTE(IA0), 0x03
1984 ja %%_blocks_3
1985 jz %%_blocks_2
1986 ;; process one block and move to transpose out
1987 %ifnidn %%DES_DOCSIS, 3DES
1988 GEN_DES_DEC_CIPHER 1, rsp + _key_sched
1989 %else
1990 GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1991 %endif
1992 jmp %%_transpose_out
1993
1994 %%_blocks_2:
1995 ;; process two blocks and move to transpose out
1996 %ifnidn %%DES_DOCSIS, 3DES
1997 GEN_DES_DEC_CIPHER 2, rsp + _key_sched
1998 %else
1999 GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2000 %endif
2001 jmp %%_transpose_out
2002
2003 %%_blocks_3:
2004 ;; process three blocks and move to transpose out
2005 %ifnidn %%DES_DOCSIS, 3DES
2006 GEN_DES_DEC_CIPHER 3, rsp + _key_sched
2007 %else
2008 GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2009 %endif
2010 jmp %%_transpose_out
2011
2012 %%_blocks_4:
2013 ;; process four blocks and move to transpose out
2014 %ifnidn %%DES_DOCSIS, 3DES
2015 GEN_DES_DEC_CIPHER 4, rsp + _key_sched
2016 %else
2017 GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2018 %endif
2019 jmp %%_transpose_out
2020
2021 %%_gt_4:
2022 cmp BYTE(IA0), 0x3f
2023 ja %%_blocks_7
2024 jz %%_blocks_6
2025 %%_blocks_5:
2026 ;; process five blocks and move to transpose out
2027 %ifnidn %%DES_DOCSIS, 3DES
2028 GEN_DES_DEC_CIPHER 5, rsp + _key_sched
2029 %else
2030 GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2031 %endif
2032 jmp %%_transpose_out
2033
2034 %%_blocks_6:
2035 ;; process six blocks and move to transpose out
2036 %ifnidn %%DES_DOCSIS, 3DES
2037 GEN_DES_DEC_CIPHER 6, rsp + _key_sched
2038 %else
2039 GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2040 %endif
2041 jmp %%_transpose_out
2042
2043 %%_blocks_7:
2044 ;; process seven blocks and move to transpose out
;; last case: falls through into the transpose, no jump needed
2045 %ifnidn %%DES_DOCSIS, 3DES
2046 GEN_DES_DEC_CIPHER 7, rsp + _key_sched
2047 %else
2048 GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2049 %endif
2050
2051 %%_transpose_out:
2052 ;; transpose data on output
2053 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
2054
2055 ;; run masked stores
;; only the qwords selected by k7 are written to the output buffers
2056 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
2057 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
2058 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
2059 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
2060 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
2061 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
2062 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
2063 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
2064 vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
2065 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
2066 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
2067 vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
2068 vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
2069 vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
2070 vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
2071 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
2072
2073 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
2074 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
2075 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
2076 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
2077 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
2078 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
2079 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
2080 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
2081 vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
2082 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
2083 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
2084 vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
2085 vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
2086 vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
2087 vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
2088 vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
2089 %%_gen_des_dec_part_end:
2090
2091 ;; store IV and update pointers
2092 DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
2093
;; NOTE(review): going by its name, CLEAR_KEY_SCHEDULE wipes the key schedule
;; area(s) of the frame before the frame is released - confirm in its
;; definition.
2094 CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
2095
2096 ;; restore stack pointer and registers
;; RSP is reloaded last because the other saved values are addressed through it
2097 mov r12, [rsp + _gpr_save + 0*8]
2098 mov r13, [rsp + _gpr_save + 1*8]
2099 mov r14, [rsp + _gpr_save + 2*8]
2100 mov r15, [rsp + _gpr_save + 3*8]
2101 mov rsp, [rsp + _rsp_save] ; original SP
2102 %endmacro
2103
2104
2105 ;;; ========================================================
2106 ;;; DATA
2107
2108 section .data
2109 default rel
;;; mask_values
;;; Nineteen 64-byte rows. Each row is one 32-bit mask replicated into all
;;; 16 dword slots (16 x 4 bytes); "times 16 dd X" emits the same bytes as
;;; writing X out sixteen times.
;;; NOTE(review): the code indexing this table is outside this view; the row
;;; order below must not change.
align 64
mask_values:
times 16 dd 0x04000000 ; row 0
times 16 dd 0x40240202 ; row 1
times 16 dd 0x00001110 ; row 2
times 16 dd 0x01088000 ; row 3
times 16 dd 0x00000001 ; row 4
times 16 dd 0x0081000C ; row 5
times 16 dd 0x00000020 ; row 6
times 16 dd 0x00000040 ; row 7
times 16 dd 0x00400400 ; row 8
times 16 dd 0x00000800 ; row 9
times 16 dd 0x00002000 ; row 10
times 16 dd 0x00100000 ; row 11
times 16 dd 0x00004000 ; row 12
times 16 dd 0x00020000 ; row 13
times 16 dd 0x02000000 ; row 14
times 16 dd 0x08000000 ; row 15
times 16 dd 0x00000080 ; row 16
times 16 dd 0x20000000 ; row 17
times 16 dd 0x90000000 ; row 18
2188
;;; init_perm_consts
;;; Five 64-byte rows, each a single 32-bit bit-pattern repeated in all 16
;;; dword slots.
;;; NOTE(review): judging by the label these are the masks of the DES initial
;;; permutation code, which is outside this view - confirm against its users.
align 64
init_perm_consts:
times 16 dd 0x0f0f0f0f ; row 0: low nibble of every byte
times 16 dd 0x0000ffff ; row 1: low 16 bits of every dword
times 16 dd 0x33333333 ; row 2: low bit pair of every nibble
times 16 dd 0x00ff00ff ; row 3: low byte of every 16-bit word
times 16 dd 0x55555555 ; row 4: every even-numbered bit
2211
2212 ;;; S-Box table
;;; Eight tables (SBOX0..SBOX7) of 64 entries each. Every entry is a 4-bit
;;; value (0x00-0x0f) stored in a 16-bit word, so one table takes 128 bytes.
;;; NOTE(review): a spot check of SBOX0 against the standard DES S1 suggests
;;; that "flipped" means both the 6-bit input index and the 4-bit output are
;;; bit-reversed: entry 0 is 0x07 (S1 output 14 reversed), entry 1 is 0x02
;;; (S1 for input 100000b is 4, reversed), entry 2 is 0x0c (S1 for input
;;; 010000b is 3, reversed). Confirm for the remaining tables before relying
;;; on it.
2213 align 64
2214 S_box_flipped:
2215 ;; SBOX0
2216 dw 0x07, 0x02, 0x0c, 0x0f, 0x04, 0x0b, 0x0a, 0x0c
2217 dw 0x0b, 0x07, 0x06, 0x09, 0x0d, 0x04, 0x00, 0x0a
2218 dw 0x02, 0x08, 0x05, 0x03, 0x0f, 0x06, 0x09, 0x05
2219 dw 0x08, 0x01, 0x03, 0x0e, 0x01, 0x0d, 0x0e, 0x00
2220 dw 0x00, 0x0f, 0x05, 0x0a, 0x07, 0x02, 0x09, 0x05
2221 dw 0x0e, 0x01, 0x03, 0x0c, 0x0b, 0x08, 0x0c, 0x06
2222 dw 0x0f, 0x03, 0x06, 0x0d, 0x04, 0x09, 0x0a, 0x00
2223 dw 0x02, 0x04, 0x0d, 0x07, 0x08, 0x0e, 0x01, 0x0b
2224 ;; SBOX1
2225 dw 0x0f, 0x00, 0x09, 0x0a, 0x06, 0x05, 0x03, 0x09
2226 dw 0x01, 0x0e, 0x04, 0x03, 0x0c, 0x0b, 0x0a, 0x04
2227 dw 0x08, 0x07, 0x0e, 0x01, 0x0d, 0x02, 0x00, 0x0c
2228 dw 0x07, 0x0d, 0x0b, 0x06, 0x02, 0x08, 0x05, 0x0f
2229 dw 0x0c, 0x0b, 0x03, 0x0d, 0x0f, 0x0c, 0x06, 0x00
2230 dw 0x02, 0x05, 0x08, 0x0e, 0x01, 0x02, 0x0d, 0x07
2231 dw 0x0b, 0x01, 0x00, 0x06, 0x04, 0x0f, 0x09, 0x0a
2232 dw 0x0e, 0x08, 0x05, 0x03, 0x07, 0x04, 0x0a, 0x09
2233 ;; SBOX2
2234 dw 0x05, 0x0b, 0x08, 0x0d, 0x06, 0x01, 0x0d, 0x0a
2235 dw 0x09, 0x02, 0x03, 0x04, 0x0f, 0x0c, 0x04, 0x07
2236 dw 0x00, 0x06, 0x0b, 0x08, 0x0c, 0x0f, 0x02, 0x05
2237 dw 0x07, 0x09, 0x0e, 0x03, 0x0a, 0x00, 0x01, 0x0e
2238 dw 0x0b, 0x08, 0x04, 0x02, 0x0c, 0x06, 0x03, 0x0d
2239 dw 0x00, 0x0b, 0x0a, 0x07, 0x06, 0x01, 0x0f, 0x04
2240 dw 0x0e, 0x05, 0x01, 0x0f, 0x02, 0x09, 0x0d, 0x0a
2241 dw 0x09, 0x00, 0x07, 0x0c, 0x05, 0x0e, 0x08, 0x03
2242 ;; SBOX3
2243 dw 0x0e, 0x05, 0x08, 0x0f, 0x00, 0x03, 0x0d, 0x0a
2244 dw 0x07, 0x09, 0x01, 0x0c, 0x09, 0x0e, 0x02, 0x01
2245 dw 0x0b, 0x06, 0x04, 0x08, 0x06, 0x0d, 0x03, 0x04
2246 dw 0x0c, 0x00, 0x0a, 0x07, 0x05, 0x0b, 0x0f, 0x02
2247 dw 0x0b, 0x0c, 0x02, 0x09, 0x06, 0x05, 0x08, 0x03
2248 dw 0x0d, 0x00, 0x04, 0x0a, 0x00, 0x0b, 0x07, 0x04
2249 dw 0x01, 0x0f, 0x0e, 0x02, 0x0f, 0x08, 0x05, 0x0e
2250 dw 0x0a, 0x06, 0x03, 0x0d, 0x0c, 0x01, 0x09, 0x07
2251 ;; SBOX4
2252 dw 0x04, 0x02, 0x01, 0x0f, 0x0e, 0x05, 0x0b, 0x06
2253 dw 0x02, 0x08, 0x0c, 0x03, 0x0d, 0x0e, 0x07, 0x00
2254 dw 0x03, 0x04, 0x0a, 0x09, 0x05, 0x0b, 0x00, 0x0c
2255 dw 0x08, 0x0d, 0x0f, 0x0a, 0x06, 0x01, 0x09, 0x07
2256 dw 0x07, 0x0d, 0x0a, 0x06, 0x02, 0x08, 0x0c, 0x05
2257 dw 0x04, 0x03, 0x0f, 0x00, 0x0b, 0x04, 0x01, 0x0a
2258 dw 0x0d, 0x01, 0x00, 0x0f, 0x0e, 0x07, 0x09, 0x02
2259 dw 0x03, 0x0e, 0x05, 0x09, 0x08, 0x0b, 0x06, 0x0c
2260 ;; SBOX5
2261 dw 0x03, 0x09, 0x00, 0x0e, 0x09, 0x04, 0x07, 0x08
2262 dw 0x05, 0x0f, 0x0c, 0x02, 0x06, 0x03, 0x0a, 0x0d
2263 dw 0x08, 0x07, 0x0b, 0x00, 0x04, 0x01, 0x0e, 0x0b
2264 dw 0x0f, 0x0a, 0x02, 0x05, 0x01, 0x0c, 0x0d, 0x06
2265 dw 0x05, 0x02, 0x06, 0x0d, 0x0e, 0x09, 0x00, 0x06
2266 dw 0x02, 0x04, 0x0b, 0x08, 0x09, 0x0f, 0x0c, 0x01
2267 dw 0x0f, 0x0c, 0x08, 0x07, 0x03, 0x0a, 0x0d, 0x00
2268 dw 0x04, 0x03, 0x07, 0x0e, 0x0a, 0x05, 0x01, 0x0b
2269 ;; SBOX6
2270 dw 0x02, 0x08, 0x0c, 0x05, 0x0f, 0x03, 0x0a, 0x00
2271 dw 0x04, 0x0d, 0x09, 0x06, 0x01, 0x0e, 0x06, 0x09
2272 dw 0x0d, 0x02, 0x03, 0x0f, 0x00, 0x0c, 0x05, 0x0a
2273 dw 0x07, 0x0b, 0x0e, 0x01, 0x0b, 0x07, 0x08, 0x04
2274 dw 0x0b, 0x06, 0x07, 0x09, 0x02, 0x08, 0x04, 0x07
2275 dw 0x0d, 0x0b, 0x0a, 0x00, 0x08, 0x05, 0x01, 0x0c
2276 dw 0x00, 0x0d, 0x0c, 0x0a, 0x09, 0x02, 0x0f, 0x04
2277 dw 0x0e, 0x01, 0x03, 0x0f, 0x05, 0x0e, 0x06, 0x03
2278 ;; SBOX7
2279 dw 0x0b, 0x0e, 0x05, 0x00, 0x06, 0x09, 0x0a, 0x0f
2280 dw 0x01, 0x02, 0x0c, 0x05, 0x0d, 0x07, 0x03, 0x0a
2281 dw 0x04, 0x0d, 0x09, 0x06, 0x0f, 0x03, 0x00, 0x0c
2282 dw 0x02, 0x08, 0x07, 0x0b, 0x08, 0x04, 0x0e, 0x01
2283 dw 0x08, 0x04, 0x03, 0x0f, 0x05, 0x02, 0x00, 0x0c
2284 dw 0x0b, 0x07, 0x06, 0x09, 0x0e, 0x01, 0x09, 0x06
2285 dw 0x0f, 0x08, 0x0a, 0x03, 0x0c, 0x05, 0x07, 0x0a
2286 dw 0x01, 0x0e, 0x0d, 0x00, 0x02, 0x0b, 0x04, 0x0d
2287
;;; vec_ones_32b
;;; Sixteen 32-bit lanes, each holding the value 1 (64 bytes in total).
;;; Used in DOCSIS DES partial block scheduling.
align 64
vec_ones_32b:
times 16 dd 1
2292
;;; and_eu
;;; 64-byte AND mask: 0x3f00 in every 16-bit word, i.e. the low six bits of
;;; the upper byte of each word.
align 64
and_eu:
times 16 dd 0x3f003f00
2299
;;; and_ed
;;; 64-byte AND mask: 0x003f in every 16-bit word, i.e. the low six bits of
;;; the lower byte of each word.
align 64
and_ed:
times 16 dd 0x003f003f
2306
;;; idx_e
;;; 64 one-byte indices in the range 0x00-0x3f, listed here in memory order
;;; (the bytes match the little-endian layout of the equivalent qwords).
;;; Inside every 16-byte group the indices name the even 16-bit words first
;;; (bytes 0,1 4,5 8,9 12,13) and then the odd ones (bytes 2,3 6,7 10,11 14,15).
;;; NOTE(review): presumably a byte-permute control vector; its consumer is
;;; outside this view - confirm.
align 64
idx_e:
db 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
db 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f
db 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d
db 0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f
db 0x20, 0x21, 0x24, 0x25, 0x28, 0x29, 0x2c, 0x2d
db 0x22, 0x23, 0x26, 0x27, 0x2a, 0x2b, 0x2e, 0x2f
db 0x30, 0x31, 0x34, 0x35, 0x38, 0x39, 0x3c, 0x3d
db 0x32, 0x33, 0x36, 0x37, 0x3a, 0x3b, 0x3e, 0x3f
2313
;;; reg_values16bit_7
;;; 64 bytes holding the value 0x001f in each of the 32 16-bit words.
align 64
reg_values16bit_7:
times 32 dw 0x001f
2320
;;; shuffle_reg
;;; 64 one-byte indices in the range 0x00-0x3f, listed here in memory order
;;; (the bytes match the little-endian layout of the equivalent qwords).
;;; Every group of four indices is n+0, n+2, n+1, n+3: the two middle bytes of
;;; each 4-byte group are exchanged.
;;; NOTE(review): presumably a byte-shuffle control vector; its consumer is
;;; outside this view - confirm.
align 64
shuffle_reg:
db 0x00, 0x02, 0x01, 0x03, 0x04, 0x06, 0x05, 0x07
db 0x08, 0x0a, 0x09, 0x0b, 0x0c, 0x0e, 0x0d, 0x0f
db 0x10, 0x12, 0x11, 0x13, 0x14, 0x16, 0x15, 0x17
db 0x18, 0x1a, 0x19, 0x1b, 0x1c, 0x1e, 0x1d, 0x1f
db 0x20, 0x22, 0x21, 0x23, 0x24, 0x26, 0x25, 0x27
db 0x28, 0x2a, 0x29, 0x2b, 0x2c, 0x2e, 0x2d, 0x2f
db 0x30, 0x32, 0x31, 0x33, 0x34, 0x36, 0x35, 0x37
db 0x38, 0x3a, 0x39, 0x3b, 0x3c, 0x3e, 0x3d, 0x3f
2327
2328 ;;; ========================================================
2329 ;;; CODE
2330 section .text
2331
;;; des_x16_cbc_enc_avx512
;;; Entry point built from GENERIC_DES_ENC with the DES selector (DES CBC
;;; encrypt). The label is aligned to 64 bytes and the only instruction added
;;; after the macro expansion is RET.
;;; NOTE(review): MKGLOBAL is defined outside this view; presumably it exports
;;; the symbol as a function with internal visibility - confirm.
2332 ;;; arg 1 : pointer to DES OOO structure
2333 ;;; arg 2 : size in bytes
2334 align 64
2335 MKGLOBAL(des_x16_cbc_enc_avx512,function,internal)
2336 des_x16_cbc_enc_avx512:
2337 GENERIC_DES_ENC DES
2338 ret
2339
;;; des_x16_cbc_dec_avx512
;;; Entry point built from GENERIC_DES_DEC with the DES selector (DES CBC
;;; decrypt). The macro builds and tears down its own stack frame and
;;; restores r12-r15, so the only instruction added here is RET.
2340 ;;; arg 1 : pointer to DES OOO structure
2341 ;;; arg 2 : size in bytes
2342 align 64
2343 MKGLOBAL(des_x16_cbc_dec_avx512,function,internal)
2344 des_x16_cbc_dec_avx512:
2345 GENERIC_DES_DEC DES
2346 ret
2347
;;; des3_x16_cbc_enc_avx512
;;; Entry point built from GENERIC_DES_ENC with the 3DES selector (3DES CBC
;;; encrypt). The label is aligned to 64 bytes and the only instruction added
;;; after the macro expansion is RET.
2348 ;;; arg 1 : pointer to DES OOO structure
2349 ;;; arg 2 : size in bytes
2350 align 64
2351 MKGLOBAL(des3_x16_cbc_enc_avx512,function,internal)
2352 des3_x16_cbc_enc_avx512:
2353 GENERIC_DES_ENC 3DES
2354 ret
2355
;;; des3_x16_cbc_dec_avx512
;;; Entry point built from GENERIC_DES_DEC with the 3DES selector (3DES CBC
;;; decrypt); with this selector the macro expands DES3_INIT and
;;; GEN_3DES_DEC_CIPHER, which take three key schedule areas.
2356 ;;; arg 1 : pointer to DES OOO structure
2357 ;;; arg 2 : size in bytes
2358 align 64
2359 MKGLOBAL(des3_x16_cbc_dec_avx512,function,internal)
2360 des3_x16_cbc_dec_avx512:
2361 GENERIC_DES_DEC 3DES
2362 ret
2363
;;; docsis_des_x16_enc_avx512
;;; Entry point built from GENERIC_DES_ENC with the DOCSIS selector (DOCSIS
;;; DES encrypt); with this selector the macro additionally expands
;;; DES_CFB_ONE ENC after DES_FINISH.
2364 ;;; arg 1 : pointer to DES OOO structure
2365 ;;; arg 2 : size in bytes
2366 align 64
2367 MKGLOBAL(docsis_des_x16_enc_avx512,function,internal)
2368 docsis_des_x16_enc_avx512:
2369 GENERIC_DES_ENC DOCSIS
2370 ret
2371
;;; docsis_des_x16_dec_avx512
;;; Entry point built from GENERIC_DES_DEC with the DOCSIS selector (DOCSIS
;;; DES decrypt); with this selector the macro additionally expands
;;; DES_CFB_ONE DEC ahead of its 64-byte bulk loop.
2372 ;;; arg 1 : pointer to DES OOO structure
2373 ;;; arg 2 : size in bytes
2374 align 64
2375 MKGLOBAL(docsis_des_x16_dec_avx512,function,internal)
2376 docsis_des_x16_dec_avx512:
2377 GENERIC_DES_DEC DOCSIS
2378 ret
2379
;;; Linux builds only: emit an empty .note.GNU-stack section. This is the
;;; marker the GNU toolchain uses to record that the object does not require
;;; an executable stack.
2380 %ifdef LINUX
2381 section .note.GNU-stack noalloc noexec nowrite progbits
2382 %endif