2 ;; Copyright (c) 2017-2019, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 ;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz Kantecki (2)
30 ;; (1) University of Haifa, Israel
31 ;; (2) Intel Corporation
33 ;; In System V AMD64 ABI
34 ;; callee saves: RBX, RBP, R12-R15
36 ;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
39 ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
40 ;; -----------------------------------------------------------
41 ;; Windows clobbers: RAX R8 R9 R10 R11
42 ;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15
43 ;; -----------------------------------------------------------
44 ;; Linux clobbers: RAX RCX RDX R10 R11
45 ;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15
46 ;; -----------------------------------------------------------
47 ;; Clobbers ZMM0-31 and K1 to K7
49 %include "include/os.asm"
50 %include "include/reg_sizes.asm"
51 %include "mb_mgr_datastruct.asm"
52 %include "constants.asm"
54 %include "include/dbgprint.asm"
;; Storage reservations for the transposed key schedules and scratch areas.
;; _key_sched is used as an rsp-relative offset (CLEAR_KEY_SCHEDULE stores to
;; [rsp + _key_sched + offset]).
;; NOTE(review): the directives that open and close this layout are not
;; visible in this excerpt.
121 _key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
122 _key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
123 _key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
124 _tmp_iv: resq 16 ; 2 x 64 bytes
125 _tmp_in: resq 16 ; 2 x 64 bytes
126 _tmp_out: resq 16 ; 2 x 64 bytes
127 _tmp_mask: resd 16 ; 1 x 64 bytes
128 _gpr_save: resq 4 ; r12 to r15 (4 x 8 bytes)
134 ;;; ===========================================================================
135 ;;; ===========================================================================
137 ;;; ===========================================================================
138 ;;; ===========================================================================
140 ;;; ===========================================================================
141 ;;; CLEAR TRANSPOSED KEY SCHEDULE (if SAFE_DATA is selected)
142 ;;; ===========================================================================
143 %macro CLEAR_KEY_SCHEDULE 2
;; Overwrites the key schedule storage starting at [rsp + _key_sched] with the
;; contents of %%ZT, 64 bytes per store.
;; NOTE(review): the conditional, %rep/%endrep and %endmacro lines of this
;; macro, and whatever initializes %%ZT, are not visible in this excerpt.
144 %define %%ALG %1 ; [in] DES or 3DES
145 %define %%ZT %2 ; [clobbered] temporary ZMM register
;; one key schedule is 2048 bytes, i.e. 32 stores of 64 bytes
149 %assign rep_num (2048 / 64)
;; three schedules' worth of stores - presumably only for 3DES
;; (the guarding condition is not visible here)
151 %assign rep_num (rep_num * 3)
;; vmovdqa64 is the aligned form: the address must be 64-byte aligned
156 vmovdqa64 [rsp + _key_sched + offset], %%ZT
157 %assign offset (offset + 64) ; advance to the next 64-byte slot
164 ;;; ===========================================================================
166 ;;; ===========================================================================
167 ;;; A [in/out] - zmm register
168 ;;; B [in/out] - zmm register
169 ;;; NSHIFT [in] - constant to shift words by
170 ;;; MASK [in] - zmm or m512 with mask
171 ;;; T0 [clobbered] - temporary zmm register
;; Exchanges, in every 32-bit element, the bits of %%B selected by %%MASK
;; with the bits of %%A located %%NSHIFT positions higher.
;; T0 = ((A >> NSHIFT) ^ B) & MASK : selected positions where the operands differ
179 vpsrld %%T0, %%A, %%NSHIFT
180 vpxord %%T0, %%T0, %%B
181 vpandd %%T0, %%T0, %%MASK
;; B ^= T0 flips the differing bits in B
182 vpxord %%B, %%B, %%T0
;; A ^= (T0 << NSHIFT) flips the matching bits in A, completing the exchange
183 vpslld %%T0, %%T0, %%NSHIFT
184 vpxord %%A, %%A, %%T0
187 ;;; ===========================================================================
188 ;;; INITIAL PERMUTATION
189 ;;; ===========================================================================
190 ;;; L [in/out] - zmm register
191 ;;; R [in/out] - zmm register
192 ;;; T0 [clobbered] - temporary zmm register
;; Five PERMUTE steps between %%R and %%L with shifts 4, 16, 2, 8 and 1.
;; Each step reads its own 64-byte mask from init_perm_consts (entries 0..4)
;; and the roles of %%R and %%L alternate from step to step.
197 PERMUTE %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0
198 PERMUTE %%L, %%R, 16, [rel init_perm_consts + 1*64], %%T0
199 PERMUTE %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0
200 PERMUTE %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0
201 PERMUTE %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0
204 ;;; ===========================================================================
205 ;;; FINAL PERMUTATION
206 ;;; ===========================================================================
207 ;;; L [in/out] - zmm register
208 ;;; R [in/out] - zmm register
209 ;;; T0 [clobbered] - temporary zmm register
;; Five PERMUTE steps with shifts 1, 8, 2, 16 and 4, reading masks from
;; init_perm_consts entries 4 down to 0. This is the step sequence of the
;; initial permutation in reverse order, with the first two PERMUTE operands
;; swapped in every step.
214 PERMUTE %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0
215 PERMUTE %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0
216 PERMUTE %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0
217 PERMUTE %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0
218 PERMUTE %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0
221 ;;; ===========================================================================
223 ;;; ===========================================================================
224 ;;; W0 [in/out] - zmm register
225 ;;; in: vector of 16 x 32bits from S phase
226 ;;; out: permuted in vector
227 ;;; T0-T3 [clobbered] - temporary zmm register
;; Builds the result as the OR of masked terms: each term is %%W0 rotated
;; right by a constant and ANDed with one 64-byte entry of mask_values
;; (entries 0..18). Partial ORs are gathered in %%T1/%%T2 and folded into %%T0.
;; NOTE(review): several instructions that fill a temporary before it is
;; masked (for entries 0, 1, 6, 10 and 16) are not visible in this excerpt.
236 vpandd %%T0, %%T0, [rel mask_values + 0*64]
238 vpandd %%T1, %%T1, [rel mask_values + 1*64]
239 vpord %%T0, %%T0, %%T1
;; terms for mask entries 2 and 3
241 vprord %%T1, %%W0, 24
242 vpandd %%T1, %%T1, [rel mask_values + 2*64]
243 vprord %%T2, %%W0, 26
244 vpandd %%T2, %%T2, [rel mask_values + 3*64]
245 vpord %%T1, %%T1, %%T2
246 vpord %%T0, %%T0, %%T1
;; terms for mask entries 4..7
248 vprord %%T1, %%W0, 15
249 vpandd %%T1, %%T1, [rel mask_values + 4*64]
250 vprord %%T2, %%W0, 17
251 vpandd %%T2, %%T2, [rel mask_values + 5*64]
252 vpord %%T1, %%T1, %%T2
255 vpandd %%T2, %%T2, [rel mask_values + 6*64]
256 vprord %%T3, %%W0, 21
257 vpandd %%T3, %%T3, [rel mask_values + 7*64]
258 vpord %%T2, %%T2, %%T3
259 vpord %%T1, %%T1, %%T2
260 vpord %%T0, %%T0, %%T1
;; terms for mask entries 8..11
262 vprord %%T1, %%W0, 12
263 vpandd %%T1, %%T1, [rel mask_values + 8*64]
264 vprord %%T2, %%W0, 14
265 vpandd %%T2, %%T2, [rel mask_values + 9*64]
266 vpord %%T1, %%T1, %%T2
269 vpandd %%T2, %%T2, [rel mask_values + 10*64]
270 vprord %%T3, %%W0, 11
271 vpandd %%T3, %%T3, [rel mask_values + 11*64]
272 vpord %%T2, %%T2, %%T3
273 vpord %%T1, %%T1, %%T2
274 vpord %%T0, %%T0, %%T1
;; terms for mask entries 12..15
276 vprord %%T1, %%W0, 16
277 vpandd %%T1, %%T1, [rel mask_values + 12*64]
278 vprord %%T2, %%W0, 22
279 vpandd %%T2, %%T2, [rel mask_values + 13*64]
280 vpord %%T1, %%T1, %%T2
282 vprord %%T2, %%W0, 19
283 vpandd %%T2, %%T2, [rel mask_values + 14*64]
284 vprord %%T3, %%W0, 10
285 vpandd %%T3, %%T3, [rel mask_values + 15*64]
286 vpord %%T2, %%T2, %%T3
287 vpord %%T1, %%T1, %%T2
288 vpord %%T0, %%T0, %%T1
;; terms for mask entries 16..18
291 vpandd %%T1, %%T1, [rel mask_values + 16*64]
292 vprord %%T2, %%W0, 13
293 vpandd %%T2, %%T2, [rel mask_values + 17*64]
294 vpord %%T1, %%T1, %%T2
296 vprord %%T2, %%W0, 25
297 vpandd %%T2, %%T2, [rel mask_values + 18*64]
298 vpord %%T1, %%T1, %%T2
;; final OR overwrites %%W0 with the result (the input is consumed only here)
299 vpord %%W0, %%T0, %%T1
302 ;;; ===========================================================================
304 ;;; ===========================================================================
306 ;;; Expands 16x32-bit words into 16x48-bit words
307 ;;; plus XOR's result with the key schedule.
308 ;;; The output is adjusted to be friendly as S phase input.
310 ;;; in [in] - zmm register
311 ;;; out0a [out] - zmm register
312 ;;; out0b [out] - zmm register
313 ;;; out1a [out] - zmm register
314 ;;; out1b [out] - zmm register
315 ;;; k0 [in] - key schedule; zmm or m512
316 ;;; k1 [in] - key schedule; zmm or m512
317 ;;; t0-t1 [clobbered] - temporary zmm register
;; Shuffles/interleaves the rotated input, XORs it with the two key schedule
;; operands and splits each result into two masked outputs.
;; rotate every dword right by 31 bits (equivalent to left by 1)
329 vprord %%T0, %%IN, 31
;; NOTE(review): the instruction that fills %%T1 is not visible in this excerpt
;; reorder bytes of both temporaries with the idx_e shuffle pattern
331 vpshufb %%T0, %%T0, [rel idx_e]
332 vpshufb %%T1, %%T1, [rel idx_e]
;; interleave bytes of T0/T1: low halves of each 128-bit lane -> OUT0A, high halves -> OUT1A
333 vpunpcklbw %%OUT0A, %%T0, %%T1
334 vpunpckhbw %%OUT1A, %%T0, %%T1
;; mix in the key schedule
335 vpxord %%OUT0A, %%OUT0A, %%K0
336 vpxord %%OUT1A, %%OUT1A, %%K1
;; split: bits selected by and_eu are moved down by 8 within each 16-bit word
;; into the B output; bits selected by and_ed stay in the A output
337 vpandd %%OUT0B, %%OUT0A, [rel and_eu]
338 vpsrlw %%OUT0B, %%OUT0B, 8
339 vpandd %%OUT0A, %%OUT0A, [rel and_ed]
340 vpandd %%OUT1B, %%OUT1A, [rel and_eu]
341 vpsrlw %%OUT1B, %%OUT1B, 8
342 vpandd %%OUT1A, %%OUT1A, [rel and_ed]
345 ;;; ===========================================================================
347 ;;; ===========================================================================
349 ;;; NOTE: clobbers k1-k6 OpMask registers
351 ;;; IN0A [in] - zmm register; output from E-phase
352 ;;; IN0B [in] - zmm register; output from E-phase
353 ;;; IN1A [in] - zmm register; output from E-phase
354 ;;; IN1B [in] - zmm register; output from E-phase
355 ;;; OUT [out] - zmm register; output of the S-phase
356 ;;; T0-T5 [clobbered] - temporary zmm register
;; Table lookups with vpermw: the 16-bit words of each input register are used
;; as indices into 64-byte tables from S_box_flipped. The partial results are
;; combined with XOR and masked merges.
;; k3..k6: per-word flags, set where the input word is <= the corresponding
;; word of reg_values16bit_7 (unsigned compare)
370 vmovdqa64 %%T0, [rel reg_values16bit_7]
371 vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE
372 vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE
373 vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE
374 vpcmpuw k6, %%IN1B, %%T0, 2 ; 2 -> LE
;; NOTE(review): these two constants are presumably copied into k1 and k2 by
;; instructions not visible in this excerpt. As word masks, 0x55555555 selects
;; even-numbered words and 0xaaaaaaaa selects odd-numbered words.
376 mov DWORD(IA0), 0x55555555
378 mov DWORD(IA0), 0xaaaaaaaa
;; IN0A: tables 0/1 under k1 and 4/5 under k2 (zeroing masks, so the XOR merges them)
381 vpermw %%T0{k1}{z}, %%IN0A, [rel S_box_flipped + 0*64]
382 vpermw %%T1{k1}{z}, %%IN0A, [rel S_box_flipped + 1*64]
383 vpermw %%T2{k2}{z}, %%IN0A, [rel S_box_flipped + 4*64]
384 vpermw %%T3{k2}{z}, %%IN0A, [rel S_box_flipped + 5*64]
385 vpxord %%T0, %%T0, %%T2
386 vpxord %%OUT, %%T1, %%T3
;; words flagged in k3 take the T0 result, the others keep T1 ^ T3
387 vmovdqu16 %%OUT{k3}, %%T0
;; IN0B: tables 2/3 and 6/7, selected per word by k4
389 vpermw %%T0{k1}{z}, %%IN0B, [rel S_box_flipped + 2*64]
390 vpermw %%T1{k1}{z}, %%IN0B, [rel S_box_flipped + 3*64]
391 vpermw %%T2{k2}{z}, %%IN0B, [rel S_box_flipped + 6*64]
392 vpermw %%T3{k2}{z}, %%IN0B, [rel S_box_flipped + 7*64]
393 vpxord %%T0, %%T0, %%T2
394 vpxord %%T3, %%T1, %%T3
395 vmovdqu16 %%T3{k4}, %%T0
;; accumulate the IN0B result into OUT
397 vpxord %%OUT, %%OUT, %%T3
;; IN1A: tables 8/9 and 12/13, selected per word by k5; result kept in T4
399 vpermw %%T0{k1}{z}, %%IN1A, [rel S_box_flipped + 8*64]
400 vpermw %%T1{k1}{z}, %%IN1A, [rel S_box_flipped + 9*64]
401 vpermw %%T2{k2}{z}, %%IN1A, [rel S_box_flipped + 12*64]
402 vpermw %%T3{k2}{z}, %%IN1A, [rel S_box_flipped + 13*64]
403 vpxord %%T0, %%T0, %%T2
404 vpxord %%T4, %%T1, %%T3
405 vmovdqu16 %%T4{k5}, %%T0
;; IN1B: tables 10/11 and 14/15, selected per word by k6; result kept in T5
407 vpermw %%T0{k1}{z}, %%IN1B, [rel S_box_flipped + 10*64]
408 vpermw %%T1{k1}{z}, %%IN1B, [rel S_box_flipped + 11*64]
409 vpermw %%T2{k2}{z}, %%IN1B, [rel S_box_flipped + 14*64]
410 vpermw %%T3{k2}{z}, %%IN1B, [rel S_box_flipped + 15*64]
411 vpxord %%T0, %%T0, %%T2
412 vpxord %%T5, %%T1, %%T3
413 vmovdqu16 %%T5{k6}, %%T0
;; combine the IN1A/IN1B results, fold them into OUT, then reorder bytes
;; with the shuffle_reg pattern
416 vpxord %%T4, %%T4, %%T5
418 vpxord %%OUT, %%OUT, %%T4
419 vpshufb %%OUT, %%OUT, [rel shuffle_reg]
422 ;;; ===========================================================================
423 ;;; DES encryption/decryption round
424 ;;; ===========================================================================
426 ;;; Clobbers k1-k6 OpMask registers
428 ;;; ENC_DEC [in] - ENC for encryption, DEC for decryption
429 ;;; R [in/out] - zmm register; plain text in & cipher text out
430 ;;; L [in/out] - zmm register; plain text in & cipher text out
431 ;;; KS [in] - pointer to the key schedule
432 ;;; T0-T11 [clobbered] - temporary zmm register
433 %macro DES_ENC_DEC 16
;; Each visible round is E_PHASE -> S_PHASE -> P_PHASE on one half, with the
;; result XORed into the other half. One E_PHASE consumes two 64-byte key
;; schedule entries, so a pair of rounds uses 4*64 bytes of schedule.
;; NOTE(review): the parameter %defines, loop labels/branches, KSOFFSET
;; updates, %else/%endif and %endmacro are not visible in this excerpt.
453 %ifidn %%ENC_DEC, ENC
;; encryption: start at schedule byte offset 0 and use positive displacements
455 xor KSOFFSET, KSOFFSET
;; round on %%R, result folded into %%L
457 E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7
458 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
459 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
460 vpxord %%L, %%L, %%T0
;; round on %%L, result folded into %%R
462 E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7
463 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
464 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
465 vpxord %%R, %%R, %%T0
;; 8*(4*64) = 2048 bytes; presumably the loop exit test (branch not visible)
468 cmp KSOFFSET, (8*(4*64))
;; decryption (presumably the %else branch): start at byte offset 2048 and
;; address the schedule entries with negative displacements
473 mov KSOFFSET, (8*(4*64))
;; round on %%R using the two entries just below KSOFFSET, folded into %%L
475 E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7
476 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
477 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
478 vpxord %%L, %%L, %%T0
;; round on %%L using the next two lower entries, folded into %%R
480 E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7
481 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
482 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
483 vpxord %%R, %%R, %%T0
491 ;;; ===========================================================================
492 ;;; DATA TRANSPOSITION AT DATA INPUT
493 ;;; ===========================================================================
495 ;;; IN00 - IN15 [in/out]:
496 ;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
497 ;;; out: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
498 ;;; T0-T3 [clobbered] - temporary zmm registers
499 ;;; K0-K5 [clobbered] - temporary zmm registers
500 ;;; H0-H3 [clobbered] - temporary zmm registers
501 %macro TRANSPOSE_IN 30
;; Transposes sixteen zmm registers in three stages: 32-bit interleave,
;; 64-bit interleave, then 128-bit lane shuffles.
;; Parameters %1-%16 are the data registers. The %define lines naming the
;; remaining parameters (%%T*, %%K* and %%H* temporaries used below) and the
;; %endmacro are not visible in this excerpt.
502 %define %%IN00 %1 ; R0
503 %define %%IN01 %2 ; L0
504 %define %%IN02 %3 ; R1
505 %define %%IN03 %4 ; L1
506 %define %%IN04 %5 ; R2
507 %define %%IN05 %6 ; L2
508 %define %%IN06 %7 ; R3
509 %define %%IN07 %8 ; L3
510 %define %%IN08 %9 ; R4
511 %define %%IN09 %10 ; L4
512 %define %%IN10 %11 ; R5
513 %define %%IN11 %12 ; L5
514 %define %%IN12 %13 ; R6
515 %define %%IN13 %14 ; L6
516 %define %%IN14 %15 ; R7
517 %define %%IN15 %16 ; L7
;; 32-bit interleave of the pairs IN00/IN01 and IN02/IN03 into temporaries
533 vpunpckldq %%K0, %%IN00, %%IN01
534 vpunpckhdq %%K1, %%IN00, %%IN01
535 vpunpckldq %%T0, %%IN02, %%IN03
536 vpunpckhdq %%T1, %%IN02, %%IN03
;; 32-bit interleave of IN04/IN05 and IN06/IN07; IN00-IN03 were consumed
;; above and are reused as destinations
538 vpunpckldq %%IN00, %%IN04, %%IN05
539 vpunpckhdq %%IN01, %%IN04, %%IN05
540 vpunpckldq %%IN02, %%IN06, %%IN07
541 vpunpckhdq %%IN03, %%IN06, %%IN07
;; 64-bit interleave of the first group
543 vpunpcklqdq %%K2, %%K0, %%T0
544 vpunpckhqdq %%T2, %%K0, %%T0
545 vpunpcklqdq %%K3, %%K1, %%T1
546 vpunpckhqdq %%T3, %%K1, %%T1
;; 64-bit interleave of the second group (K0, K1, T0, T1 are free again)
548 vpunpcklqdq %%K0, %%IN00, %%IN02
549 vpunpckhqdq %%K1, %%IN00, %%IN02
550 vpunpcklqdq %%T0, %%IN01, %%IN03
551 vpunpckhqdq %%T1, %%IN01, %%IN03
;; 32-bit interleave of the pairs taken from IN08..IN15
553 vpunpckldq %%K4, %%IN08, %%IN09
554 vpunpckhdq %%K5, %%IN08, %%IN09
555 vpunpckldq %%IN04, %%IN10, %%IN11
556 vpunpckhdq %%IN05, %%IN10, %%IN11
557 vpunpckldq %%IN06, %%IN12, %%IN13
558 vpunpckhdq %%IN07, %%IN12, %%IN13
559 vpunpckldq %%IN10, %%IN14, %%IN15
560 vpunpckhdq %%IN11, %%IN14, %%IN15
;; 64-bit interleave of those results
562 vpunpcklqdq %%IN12, %%K4, %%IN04
563 vpunpckhqdq %%IN13, %%K4, %%IN04
564 vpunpcklqdq %%IN14, %%K5, %%IN05
565 vpunpckhqdq %%IN15, %%K5, %%IN05
566 vpunpcklqdq %%IN00, %%IN06, %%IN10
567 vpunpckhqdq %%IN01, %%IN06, %%IN10
568 vpunpcklqdq %%IN02, %%IN07, %%IN11
569 vpunpckhqdq %%IN03, %%IN07, %%IN11
;; 128-bit lane shuffles. Immediates: 0x44 = lanes 0,1 of each source,
;; 0xee = lanes 2,3 of each source, 0x88 = lanes 0,2, 0xdd = lanes 1,3.
571 vshufi64x2 %%H0, %%K2, %%K0, 0x44
572 vshufi64x2 %%H1, %%K2, %%K0, 0xee
573 vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
574 vshufi64x2 %%H3, %%IN12, %%IN00, 0xee
575 vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
576 vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2
577 vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4
578 vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6
;; same lane shuffle pattern for the L0/L2/L4/L6 outputs
580 vshufi64x2 %%H0, %%T2, %%K1, 0x44
581 vshufi64x2 %%H1, %%T2, %%K1, 0xee
582 vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
583 vshufi64x2 %%H3, %%IN13, %%IN01, 0xee
584 vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
585 vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2
586 vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4
587 vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6
;; same lane shuffle pattern for the R1/R3/R5/R7 outputs
589 vshufi64x2 %%H0, %%K3, %%T0, 0x44
590 vshufi64x2 %%H1, %%K3, %%T0, 0xee
591 vshufi64x2 %%H2, %%IN14, %%IN02, 0x44
592 vshufi64x2 %%H3, %%IN14, %%IN02, 0xee
593 vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1
594 vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3
595 vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5
596 vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7
;; same lane shuffle pattern for the L1/L3/L5/L7 outputs
598 vshufi64x2 %%H0, %%T3, %%T1, 0x44
599 vshufi64x2 %%H1, %%T3, %%T1, 0xee
600 vshufi64x2 %%H2, %%IN15, %%IN03, 0x44
601 vshufi64x2 %%H3, %%IN15, %%IN03, 0xee
602 vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1
603 vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3
604 vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5
605 vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7
608 ;;; ===========================================================================
609 ;;; DATA TRANSPOSITION AT DATA OUTPUT
610 ;;; ===========================================================================
612 ;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
613 ;;; in: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
614 ;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
615 ;;; T0-T3 [clobbered] - temporary zmm registers
616 ;;; K0-K5 [clobbered] - temporary zmm registers
617 ;;; H0-H3 [clobbered] - temporary zmm registers
618 %macro TRANSPOSE_OUT 30
;; Same three-stage structure as TRANSPOSE_IN (32-bit interleave, 64-bit
;; interleave, 128-bit lane shuffles). The difference is in the 32-bit
;; interleaves, which take the odd-numbered register first (IN01 before IN00
;; and so on).
;; The %define lines naming the %%T*, %%K* and %%H* temporaries and the
;; %endmacro are not visible in this excerpt.
619 %define %%IN00 %1 ; R0
620 %define %%IN01 %2 ; L0
621 %define %%IN02 %3 ; R1
622 %define %%IN03 %4 ; L1
623 %define %%IN04 %5 ; R2
624 %define %%IN05 %6 ; L2
625 %define %%IN06 %7 ; R3
626 %define %%IN07 %8 ; L3
627 %define %%IN08 %9 ; R4
628 %define %%IN09 %10 ; L4
629 %define %%IN10 %11 ; R5
630 %define %%IN11 %12 ; L5
631 %define %%IN12 %13 ; R6
632 %define %%IN13 %14 ; L6
633 %define %%IN14 %15 ; R7
634 %define %%IN15 %16 ; L7
;; 32-bit interleave of the pairs IN01/IN00 and IN03/IN02 into temporaries
650 vpunpckldq %%K0, %%IN01, %%IN00
651 vpunpckhdq %%K1, %%IN01, %%IN00
652 vpunpckldq %%T0, %%IN03, %%IN02
653 vpunpckhdq %%T1, %%IN03, %%IN02
;; 32-bit interleave of IN05/IN04 and IN07/IN06; IN00-IN03 are reused as destinations
655 vpunpckldq %%IN00, %%IN05, %%IN04
656 vpunpckhdq %%IN01, %%IN05, %%IN04
657 vpunpckldq %%IN02, %%IN07, %%IN06
658 vpunpckhdq %%IN03, %%IN07, %%IN06
;; 64-bit interleave of the first group
660 vpunpcklqdq %%K2, %%K0, %%T0
661 vpunpckhqdq %%T2, %%K0, %%T0
662 vpunpcklqdq %%K3, %%K1, %%T1
663 vpunpckhqdq %%T3, %%K1, %%T1
;; 64-bit interleave of the second group
665 vpunpcklqdq %%K0, %%IN00, %%IN02
666 vpunpckhqdq %%K1, %%IN00, %%IN02
667 vpunpcklqdq %%T0, %%IN01, %%IN03
668 vpunpckhqdq %%T1, %%IN01, %%IN03
;; 32-bit interleave of the pairs taken from IN08..IN15 (odd register first)
670 vpunpckldq %%K4, %%IN09, %%IN08
671 vpunpckhdq %%K5, %%IN09, %%IN08
672 vpunpckldq %%IN04, %%IN11, %%IN10
673 vpunpckhdq %%IN05, %%IN11, %%IN10
674 vpunpckldq %%IN06, %%IN13, %%IN12
675 vpunpckhdq %%IN07, %%IN13, %%IN12
676 vpunpckldq %%IN10, %%IN15, %%IN14
677 vpunpckhdq %%IN11, %%IN15, %%IN14
;; 64-bit interleave of those results
679 vpunpcklqdq %%IN12, %%K4, %%IN04
680 vpunpckhqdq %%IN13, %%K4, %%IN04
681 vpunpcklqdq %%IN14, %%K5, %%IN05
682 vpunpckhqdq %%IN15, %%K5, %%IN05
683 vpunpcklqdq %%IN00, %%IN06, %%IN10
684 vpunpckhqdq %%IN01, %%IN06, %%IN10
685 vpunpcklqdq %%IN02, %%IN07, %%IN11
686 vpunpckhqdq %%IN03, %%IN07, %%IN11
;; 128-bit lane shuffles. Immediates: 0x44 = lanes 0,1 of each source,
;; 0xee = lanes 2,3 of each source, 0x88 = lanes 0,2, 0xdd = lanes 1,3.
688 vshufi64x2 %%H0, %%K2, %%K0, 0x44
689 vshufi64x2 %%H1, %%K2, %%K0, 0xee
690 vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
691 vshufi64x2 %%H3, %%IN12, %%IN00, 0xee
692 vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
693 vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2
694 vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4
695 vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6
;; same lane shuffle pattern for the L0/L2/L4/L6 outputs
697 vshufi64x2 %%H0, %%T2, %%K1, 0x44
698 vshufi64x2 %%H1, %%T2, %%K1, 0xee
699 vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
700 vshufi64x2 %%H3, %%IN13, %%IN01, 0xee
701 vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
702 vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2
703 vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4
704 vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6
;; same lane shuffle pattern for the R1/R3/R5/R7 outputs
706 vshufi64x2 %%H0, %%K3, %%T0, 0x44
707 vshufi64x2 %%H1, %%K3, %%T0, 0xee
708 vshufi64x2 %%H2, %%IN14, %%IN02, 0x44
709 vshufi64x2 %%H3, %%IN14, %%IN02, 0xee
710 vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1
711 vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3
712 vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5
713 vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7
;; same lane shuffle pattern for the L1/L3/L5/L7 outputs
715 vshufi64x2 %%H0, %%T3, %%T1, 0x44
716 vshufi64x2 %%H1, %%T3, %%T1, 0xee
717 vshufi64x2 %%H2, %%IN15, %%IN03, 0x44
718 vshufi64x2 %%H3, %%IN15, %%IN03, 0xee
719 vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1
720 vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3
721 vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5
722 vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7
725 ;;; ===========================================================================
726 ;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT
727 ;;; ===========================================================================
729 ;;; IN00-IN15 / R0/L0-R7/L7 [in/out]:
730 ;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
731 ;;; out: R0 - 16 x word0, L0 - 16 x word1
732 ;;; T0,T2 [clobbered] - temporary zmm registers
733 ;;; K0-K4 [clobbered] - temporary zmm registers
734 ;;; H0,H2 [clobbered] - temporary zmm registers
735 %macro TRANSPOSE_IN_ONE 24
;; Reduced input transposition: only the two final results, IN00 (R0) and
;; IN01 (L0), are produced. Other IN registers are overwritten as scratch.
;; The %define lines naming the %%T*, %%K* and %%H* temporaries and the
;; %endmacro are not visible in this excerpt.
736 %define %%IN00 %1 ; R0
737 %define %%IN01 %2 ; L0
738 %define %%IN02 %3 ; R1
739 %define %%IN03 %4 ; L1
740 %define %%IN04 %5 ; R2
741 %define %%IN05 %6 ; L2
742 %define %%IN06 %7 ; R3
743 %define %%IN07 %8 ; L3
744 %define %%IN08 %9 ; R4
745 %define %%IN09 %10 ; L4
746 %define %%IN10 %11 ; R5
747 %define %%IN11 %12 ; L5
748 %define %%IN12 %13 ; R6
749 %define %%IN13 %14 ; L6
750 %define %%IN14 %15 ; R7
751 %define %%IN15 %16 ; L7
;; 32-bit interleave of IN00..IN03 pairs into temporaries
761 vpunpckldq %%K0, %%IN00, %%IN01
762 vpunpckhdq %%K1, %%IN00, %%IN01
763 vpunpckldq %%T0, %%IN02, %%IN03
;; 32-bit interleave of IN04..IN07 pairs; IN00-IN02 are reused as destinations
765 vpunpckldq %%IN00, %%IN04, %%IN05
766 vpunpckhdq %%IN01, %%IN04, %%IN05
767 vpunpckldq %%IN02, %%IN06, %%IN07
;; 64-bit interleave of the first group
769 vpunpcklqdq %%K2, %%K0, %%T0
770 vpunpckhqdq %%T2, %%K0, %%T0
;; 64-bit interleave of the second group
772 vpunpcklqdq %%K0, %%IN00, %%IN02
773 vpunpckhqdq %%K1, %%IN00, %%IN02
;; 32-bit interleave (low halves only) of the pairs taken from IN08..IN15
775 vpunpckldq %%K4, %%IN08, %%IN09
776 vpunpckldq %%IN04, %%IN10, %%IN11
777 vpunpckldq %%IN06, %%IN12, %%IN13
778 vpunpckldq %%IN10, %%IN14, %%IN15
;; 64-bit interleave of those results
780 vpunpcklqdq %%IN12, %%K4, %%IN04
781 vpunpckhqdq %%IN13, %%K4, %%IN04
782 vpunpcklqdq %%IN00, %%IN06, %%IN10
783 vpunpckhqdq %%IN01, %%IN06, %%IN10
;; 128-bit lane shuffles: 0x44 takes lanes 0,1 of each source, 0x88 takes lanes 0,2
785 vshufi64x2 %%H0, %%K2, %%K0, 0x44
786 vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
787 vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
789 vshufi64x2 %%H0, %%T2, %%K1, 0x44
790 vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
791 vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
794 ;;; ===========================================================================
795 ;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT
796 ;;; ===========================================================================
798 ;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
799 ;;; in: R0 - 16 x word0, L0 - 16 x word1
800 ;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
801 ;;; T0-T3 [clobbered] - temporary zmm registers
802 ;;; K0-K3 [clobbered] - temporary zmm registers
803 ;;; H0,H1 [clobbered] - temporary zmm registers
804 %macro TRANSPOSE_OUT_ONE 25
;; Output transposition when only IN00 (R0) and IN01 (L0) hold data: every
;; other source operand is replaced by the zeroed register %%T0, and all
;; sixteen IN registers are written.
;; The %define lines naming the %%T*, %%K* and %%H* temporaries and the
;; %endmacro are not visible in this excerpt.
805 %define %%IN00 %1 ; R0
806 %define %%IN01 %2 ; L0
807 %define %%IN02 %3 ; R1
808 %define %%IN03 %4 ; L1
809 %define %%IN04 %5 ; R2
810 %define %%IN05 %6 ; L2
811 %define %%IN06 %7 ; R3
812 %define %%IN07 %8 ; L3
813 %define %%IN08 %9 ; R4
814 %define %%IN09 %10 ; L4
815 %define %%IN10 %11 ; R5
816 %define %%IN11 %12 ; L5
817 %define %%IN12 %13 ; R6
818 %define %%IN13 %14 ; L6
819 %define %%IN14 %15 ; R7
820 %define %%IN15 %16 ; L7
;; T0 = 0; used below wherever a second data source would otherwise be needed
831 vpxord %%T0, %%T0, %%T0
;; 32-bit interleave of IN01 with IN00 (odd register first)
833 vpunpckldq %%K0, %%IN01, %%IN00
834 vpunpckhdq %%K1, %%IN01, %%IN00
;; 64-bit interleave against zero: upper qword of every 128-bit lane is cleared
836 vpunpcklqdq %%K2, %%K0, %%T0
837 vpunpckhqdq %%T2, %%K0, %%T0
838 vpunpcklqdq %%K3, %%K1, %%T0
839 vpunpckhqdq %%T3, %%K1, %%T0
;; 128-bit lane shuffles against zero. Immediates: 0x44 = lanes 0,1,
;; 0xee = lanes 2,3, 0x88 = lanes 0,2, 0xdd = lanes 1,3 of each source.
841 vshufi64x2 %%H0, %%K2, %%T0, 0x44
842 vshufi64x2 %%H1, %%K2, %%T0, 0xee
843 vshufi64x2 %%IN00, %%H0, %%T0, 0x88 ; R0
844 vshufi64x2 %%IN04, %%H0, %%T0, 0xdd ; R2
845 vshufi64x2 %%IN08, %%H1, %%T0, 0x88 ; R4
846 vshufi64x2 %%IN12, %%H1, %%T0, 0xdd ; R6
;; same pattern for the L0/L2/L4/L6 outputs
848 vshufi64x2 %%H0, %%T2, %%T0, 0x44
849 vshufi64x2 %%H1, %%T2, %%T0, 0xee
850 vshufi64x2 %%IN01, %%H0, %%T0, 0x88 ; L0
851 vshufi64x2 %%IN05, %%H0, %%T0, 0xdd ; L2
852 vshufi64x2 %%IN09, %%H1, %%T0, 0x88 ; L4
853 vshufi64x2 %%IN13, %%H1, %%T0, 0xdd ; L6
;; same pattern for the R1/R3/R5/R7 outputs
855 vshufi64x2 %%H0, %%K3, %%T0, 0x44
856 vshufi64x2 %%H1, %%K3, %%T0, 0xee
857 vshufi64x2 %%IN02, %%H0, %%T0, 0x88 ; R1
858 vshufi64x2 %%IN06, %%H0, %%T0, 0xdd ; R3
859 vshufi64x2 %%IN10, %%H1, %%T0, 0x88 ; R5
860 vshufi64x2 %%IN14, %%H1, %%T0, 0xdd ; R7
;; same pattern for the L1/L3/L5/L7 outputs
862 vshufi64x2 %%H0, %%T3, %%T0, 0x44
863 vshufi64x2 %%H1, %%T3, %%T0, 0xee
864 vshufi64x2 %%IN03, %%H0, %%T0, 0x88 ; L1
865 vshufi64x2 %%IN07, %%H0, %%T0, 0xdd ; L3
866 vshufi64x2 %%IN11, %%H1, %%T0, 0x88 ; L5
867 vshufi64x2 %%IN15, %%H1, %%T0, 0xdd ; L7
870 ;;; ===========================================================================
871 ;;; DES INITIALIZATION
872 ;;; key schedule transposition and IV set up
873 ;;; ===========================================================================
875 ;;; STATE_KEYS [in] - KEYS in DES OOO STATE
876 ;;; STATE_IV [ in] - IV in DES OOO STATE
877 ;;; KS [out] - place to store transposed key schedule or NULL
878 ;;; IV0 [out] - r512; initialization vector
879 ;;; IV1 [out] - r512; initialization vector
880 ;;; T0-T27 [clobbered] - temporary r512
;; NOTE(review): the %macro header, the remaining parameter %defines, the
;; IDX initialization and the %rep/%endrep lines around the load/store
;; templates below are not visible in this excerpt.
882 %define %%STATE_KEYS %1
883 %define %%STATE_IV %2
916 ;; set up the key schedule
917 ;; - load first half of the keys & transpose
918 ;; - transpose and store
919 ;; note: we can use IV registers as temporary ones here
;; read the pointer in slot IDX of STATE_KEYS and load the first 64 bytes it
;; points to into temporary number IDX
922 mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
923 vmovdqu64 %%T %+ IDX, [IA0]
924 %assign IDX (IDX + 1)
;; IV0/IV1 are passed as the last two scratch registers of the transposition
926 TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
;; store transposed register IDX at byte offset IDX*64 of the schedule
929 vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX
930 %assign IDX (IDX + 1)
932 ;; - load second half of the keys & transpose
933 ;; - transpose and store
934 ;; note: we can use IV registers as temporary ones here
;; same as above but reading the second 64 bytes behind each key pointer
937 mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
938 vmovdqu64 %%T %+ IDX, [IA0 + 64]
939 %assign IDX (IDX + 1)
941 TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
;; second half lands 16*64 bytes after the first half
944 vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX
945 %assign IDX (IDX + 1)
949 ;; - they are already kept transposed so this is enough to load them
;; the TRANSPOSE_IN calls above used IV0/IV1 as scratch, so they are loaded last
950 vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)]
951 vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)]
954 ;;; ===========================================================================
955 ;;; 3DES INITIALIZATION
956 ;;; key schedule transposition and IV set up
957 ;;; ===========================================================================
959 ;;; STATE_KEYS [in] - KEYS in 3DES OOO STATE
960 ;;; STATE_IV [ in] - IV in 3DES OOO STATE
961 ;;; KS1 [out] - place to store transposed key schedule or NULL
962 ;;; KS2 [out] - place to store transposed key schedule or NULL
963 ;;; KS3 [out] - place to store transposed key schedule or NULL
964 ;;; IV0 [out] - r512; initialization vector
965 ;;; IV1 [out] - r512; initialization vector
966 ;;; T0-T27 [clobbered] - temporary r512
967 ;;; DIR [in] - ENC/DEC (keys arranged in different order for enc/dec)
;; NOTE(review): the %macro header, the remaining parameter %defines, the
;; IDX/KEY_IDX/KS_IDX initialization, the inner %rep/%endrep lines, the outer
;; %rep and the ENC/DEC conditional are not visible in this excerpt.
969 %define %%STATE_KEYS %1
970 %define %%STATE_IV %2
1014 ;; set up the key schedule
1015 ;; - load first half of the keys & transpose
1016 ;; - transpose and store
1017 ;; note: we can use IV registers as temporary ones here
;; two-level lookup: slot IDX of STATE_KEYS holds a pointer to a table of key
;; pointers; entry KEY_IDX of that table points to the key data that is loaded
1021 mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
1022 mov IA0, [IA0 + (KEY_IDX * PTR_SZ)]
1023 vmovdqu64 %%T %+ IDX, [IA0]
1024 %assign IDX (IDX + 1)
;; IV0/IV1 are passed as the last two scratch registers of the transposition
1026 TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
;; destination base is the macro parameter named KS followed by the current KS_IDX value
1029 vmovdqu64 [%%KS %+ KS_IDX + (IDX * 64)], %%T %+ IDX
1030 %assign IDX (IDX + 1)
1032 ;; - load second half of the keys & transpose
1033 ;; - transpose and store
1034 ;; note: we can use IV registers as temporary ones here
;; same lookup, reading the second 64 bytes of the key data
1037 mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
1038 mov IA0, [IA0 + (KEY_IDX * PTR_SZ)]
1039 vmovdqu64 %%T %+ IDX, [IA0 + 64]
1040 %assign IDX (IDX + 1)
1042 TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
;; second half lands 16*64 bytes after the first half
1045 vmovdqu64 [%%KS %+ KS_IDX + (16 * 64) + (IDX * 64)], %%T %+ IDX
1046 %assign IDX (IDX + 1)
;; KEY_IDX steps up or down - presumably chosen by DIR (the conditional is
;; not visible here); KS_IDX always steps up
1050 %assign KEY_IDX (KEY_IDX + 1)
1052 %assign KEY_IDX (KEY_IDX - 1)
1054 %assign KS_IDX (KS_IDX + 1)
1055 %endrep ; KEY_IDX / KS_IDX
1058 ;; - they are already kept transposed so this is enough to load them
;; the TRANSPOSE_IN calls above used IV0/IV1 as scratch, so they are loaded last
1059 vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)]
1060 vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)]
1064 ;;; ===========================================================================
1066 ;;; Update in/out pointers and store IV
1067 ;;; ===========================================================================
1069 ;;; Needs: STATE & SIZE
1070 ;;; IV0 [in] - r512; initialization vector
1071 ;;; IV1 [in] - r512; initialization vector
1072 ;;; T0-T4 [clobbered] - temporary r512 registers
;; Adds SIZE to every stored input and output pointer, then writes IV0/IV1
;; back to the state.
;; NOTE(review): the %macro header and parameter %defines are not visible in
;; this excerpt.
;; replicate SIZE into every 64-bit element of T4
1082 vpbroadcastq %%T4, SIZE
;; load the in/out pointer arrays as two 64-byte halves each
;; (8 pointers per register, assuming PTR_SZ is 8 - TODO confirm)
1083 vmovdqu64 %%T0, [STATE + _des_args_in + (0 * PTR_SZ)]
1084 vmovdqu64 %%T1, [STATE + _des_args_in + (8 * PTR_SZ)]
1085 vmovdqu64 %%T2, [STATE + _des_args_out + (0 * PTR_SZ)]
1086 vmovdqu64 %%T3, [STATE + _des_args_out + (8 * PTR_SZ)]
;; advance every pointer by SIZE
1087 vpaddq %%T0, %%T0, %%T4
1088 vpaddq %%T1, %%T1, %%T4
1089 vpaddq %%T2, %%T2, %%T4
1090 vpaddq %%T3, %%T3, %%T4
;; write the advanced pointers back
1091 vmovdqu64 [STATE + _des_args_in + (0 * PTR_SZ)], %%T0
1092 vmovdqu64 [STATE + _des_args_in + (8 * PTR_SZ)], %%T1
1093 vmovdqu64 [STATE + _des_args_out + (0 * PTR_SZ)], %%T2
1094 vmovdqu64 [STATE + _des_args_out + (8 * PTR_SZ)], %%T3
;; save both IV registers (2 x 64 bytes) into the state
1096 vmovdqu64 [STATE + _des_args_IV + (0 * 64)], %%IV0
1097 vmovdqu64 [STATE + _des_args_IV + (1 * 64)], %%IV1
1100 ;;; ===========================================================================
1101 ;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY
1102 ;;; ===========================================================================
1104 ;;; Needs: STATE, IA0-IA2
1105 ;;; ENC_DEC [in] - encrypt (ENC) or decrypt (DEC) selection
1106 ;;; KS [in] - key schedule
1107 ;;; T0-T24 [clobbered] - temporary r512
1108 ;;; T_IN [in] - 16 * 8 byte storage
1109 ;;; T_OUT [in] - 16 * 8 byte storage
1110 ;;; T_MASK [in] - 16 * 4 byte storage
1111 ;;; T_IV [in] - 16 * 8 byte storage
1113 ;;; NOTE: clobbers OpMask registers
1114 %macro DES_CFB_ONE 31
1115 %define %%ENC_DEC %1
1145 %define %%T_MASK %31
1147 ;; - find mask for non-zero partial lengths
1148 vpxord %%T10, %%T10, %%T10
1149 vmovdqu64 %%T0, [STATE + _des_args_PLen]
1150 vpcmpd k3, %%T0, %%T10, 4 ; NEQ
1151 kmovw DWORD(IA0), k3
1152 movzx DWORD(IA0), WORD(IA0)
1153 or DWORD(IA0), DWORD(IA0)
1154 jz %%_des_cfb_one_end ; no non-zero partial lengths
1156 %ifidn %%ENC_DEC, ENC
1157 ;; For encryption case we need to make sure that
1158 ;; all full blocks are complete before proceeding
1159 ;; with CFB partial block.
1160 ;; To do that current out position is compared against
1161 ;; calculated last full block position.
1162 vmovdqu64 %%T1, [STATE + _des_args_out + (0*8)]
1163 vmovdqu64 %%T2, [STATE + _des_args_LOut + (0*8)]
1164 vmovdqu64 %%T3, [STATE + _des_args_out + (8*8)]
1165 vmovdqu64 %%T4, [STATE + _des_args_LOut + (8*8)]
1166 vpcmpq k4, %%T1, %%T2, 0 ; EQ
1167 vpcmpq k5, %%T3, %%T4, 0 ; EQ
1168 kmovw DWORD(IA1), k4
1169 movzx DWORD(IA1), BYTE(IA1)
1170 kmovw DWORD(IA2), k5
1171 movzx DWORD(IA2), BYTE(IA2)
1173 or DWORD(IA2), DWORD(IA1)
1174 and DWORD(IA0), DWORD(IA2)
1175 jz %%_des_cfb_one_end ; no non-zero lengths left
1176 kmovw k3, DWORD(IA0)
1178 ;; Calculate ((1 << partial_bytes) - 1)
1179 ;; in order to get the mask for loads and stores
1180 ;; k3 & IA0 - hold valid mask
1181 vmovdqa64 %%T1, [rel vec_ones_32b]
1182 vpsllvd %%T2{k3}{z}, %%T1, %%T0
1183 vpsubd %%T2{k3}{z}, %%T2, %%T1
1184 vmovdqu64 [%%T_MASK], %%T2
1186 ;; clear selected partial lens not to do them twice
1187 vmovdqu32 [STATE + _des_args_PLen]{k3}, %%T10
1189 ;; copy IV, in and out pointers
1190 vmovdqu64 %%T1, [STATE + _des_args_in + (0*PTR_SZ)]
1191 vmovdqu64 %%T2, [STATE + _des_args_in + (8*PTR_SZ)]
1192 vmovdqu64 %%T3, [STATE + _des_args_out + (0*PTR_SZ)]
1193 vmovdqu64 %%T4, [STATE + _des_args_out + (8*PTR_SZ)]
1194 vmovdqu64 %%T5, [STATE + _des_args_IV + (0*64)]
1195 vmovdqu64 %%T6, [STATE + _des_args_IV + (1*64)]
1196 vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1
1197 vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2
1198 vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3
1199 vmovdqu64 [%%T_OUT + (8*PTR_SZ)], %%T4
1200 vmovdqu64 [%%T_IV + (0*64)], %%T5
1201 vmovdqu64 [%%T_IV + (1*64)], %%T6
1203 ;; calculate last block case mask
1204 ;; - first block case requires no modifications to in/out/IV
1205 vmovdqu64 %%T1, [STATE + _des_args_BLen]
1206 vpcmpd k2, %%T1, %%T10, 4 ; NEQ
1207 kmovw DWORD(IA1), k2
1208 and DWORD(IA1), DWORD(IA0)
1209 jz %%_des_cfb_one_no_last_blocks
1211 ;; set up IV, in and out for the last block case
1212 ;; - Last block needs in and out to be set differently (decryption only)
1213 ;; - IA1 holds the last block mask
1214 %ifidn %%ENC_DEC, DEC
1215 mov DWORD(IA0), DWORD(IA1)
1216 mov DWORD(IA2), DWORD(IA1)
1218 and DWORD(IA2), 0xff
1219 kmovw k4, DWORD(IA2)
1220 kmovw k5, DWORD(IA1)
1221 vmovdqu64 %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)]
1222 vmovdqu64 %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)]
1223 vmovdqu64 %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)]
1224 vmovdqu64 %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)]
1225 vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1
1226 vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2
1227 vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3
1228 vmovdqu64 [%%T_IN + (8*PTR_SZ)]{k5}, %%T4
1230 ;; - IV has to be set differently for CFB as well
1231 ;; - IA0 holds the last block mask
1234 test DWORD(IA0), (1 << IDX)
1235 jz %%_des_cfb_one_copy_iv_next %+ IDX
1236 %ifidn %%ENC_DEC, ENC
1237 mov IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)]
1239 mov IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)]
1242 mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2)
1244 mov [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2)
1245 %%_des_cfb_one_copy_iv_next %+ IDX:
1246 %assign IDX (IDX + 1)
1249 %%_des_cfb_one_no_last_blocks:
1250 ;; Uffff ... finally let's do some DES CFB
1251 ;; - let's use T_IN, T_OUT, T_IV and T_MASK
1253 ;; - load data with the corresponding masks & transpose
1254 ;; - T0 to T15 will hold the data
1259 mov IA1, [%%T_IN + (IDX*PTR_SZ)]
1260 mov DWORD(IA0), [%%T_MASK + (IDX*4)]
1261 kmovq k %+ K_IDX, IA0
1262 vmovdqu8 %%T %+ IDX{k %+ K_IDX}{z}, [IA1]
1263 %assign IDX (IDX + 1)
1264 %assign K_IDX (K_IDX + 1)
1266 %assign K_IDX 1 ; iterate through K1 to K7
1269 ;; - transpose the data in T0 to T15, T16 to T23 are clobbered
1270 TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23
1272 ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1
1273 vmovdqu64 %%T16, [%%T_IV + (0 * 64)] ;IV0
1274 vmovdqu64 %%T17, [%%T_IV + (1 * 64)] ;IV1
1278 DES_ENC_DEC ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13
1279 ;; CFB style xor with R0/L0 with IV
1282 vpxord %%T2, %%T17, %%T0 ; R0 ^ IV1
1283 vpxord %%T0, %%T16, %%T1 ; L0 ^ IV0
1284 vmovdqa64 %%T1, %%T2
1285 ;; - new R0 = L0 ^ IV0 (%%T0)
1286 ;; - new L0 = R0 ^ IV1 (%%T1)
1288 ;; Transpose the data out
1289 ;; - %%T2 to %%T24 clobbered
1290 TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24
1292 ;; Store the transposed data
1293 ;; - T0 to T15 will hold the data
1298 mov IA1, [%%T_OUT + (IDX*PTR_SZ)]
1299 mov DWORD(IA0), [%%T_MASK + (IDX*4)]
1300 kmovq k %+ K_IDX, IA0
1301 vmovdqu8 [IA1]{k %+ K_IDX}, %%T %+ IDX
1302 %assign IDX (IDX + 1)
1303 %assign K_IDX (K_IDX + 1)
1305 %assign K_IDX 1 ; iterate through K1 to K7
1310 ;; Clear copied IV's
1312 vmovdqu64 [%%T_IV + (0*64)], %%T5
1313 vmovdqu64 [%%T_IV + (1*64)], %%T5
1320 ;;; ===========================================================================
1321 ;;; Converts length into mask of DES blocks
1322 ;;; ===========================================================================
1324 ;;; MASK [out] - mask8 for value; for masked 64b loads and stores (r64)
1325 ;;; USES: IA0, IA1, IA2
1326 ;;; ASSUMES: SIZE - OFFSET < 64
1338 ;; - myrcx - remaining length
1339 ;; - divide by 8 (DES block size)
1340 ;; - create bit mask of the result
;; MASK = (1 << count) - 1, where count is the low byte of myrcx:
;; this leaves 'count' consecutive low bits set in the 32-bit MASK.
1341 mov DWORD(%%MASK), 1
1343 shl DWORD(%%MASK), BYTE(myrcx)
1344 sub DWORD(%%MASK), 1
1350 ;;; ===========================================================================
1351 ;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
1352 ;;; ===========================================================================
;;; Encrypts NUM_DES_BLOCKS blocks held in ZW register pairs and applies the
;;; CBC chaining between them: the output of one block is XOR-ed into the
;;; next block before that block is encrypted.
;;; The IV is NOT applied to the first block here; GENERIC_DES_ENC XORs
;;; ZIV0/ZIV1 into ZW0/ZW1 before invoking this macro.
1354 ;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
1355 ;;; DES_KS [in] - pointer to transposed key schedule
1357 ;;; NOTE: clobbers OpMask registers
1358 ;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
;;; OUTPUT: ZIV0/ZIV1 are overwritten with the output of the last block
1359 %macro GEN_DES_ENC_CIPHER 2
1360 %define %%NUM_DES_BLOCKS %1
;; All blocks but the last one: encrypt the current pair (RN/LN) and
;; XOR the result into the following pair (RNN/LNN).
;; Note the cross pairing: next R takes current L, next L takes current R.
1367 %rep %%NUM_DES_BLOCKS - 1
1368 DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
1369 vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0
1370 vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0
;; each block occupies two ZW registers, hence the step of 2
1373 %assign RNN (RNN + 2)
1374 %assign LNN (LNN + 2)
;; Last block: encrypt it and keep its output in ZIV0/ZIV1 so that the
;; chaining can continue from there.
;; (the "L7"/"R7" labels below presumably describe the 8 block case)
1376 DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
1377 vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7
1378 vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7
1381 ;;; ===========================================================================
1382 ;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
1383 ;;; ===========================================================================
;;; For every block: a copy of the input pair is saved, the pair is
;;; decrypted, the result is XOR-ed with the current IV (ZIV0/ZIV1) and the
;;; saved input becomes the IV for the following block.
1385 ;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
1386 ;;; DES_KS [in] - pointer to transposed key schedule
1388 ;;; NOTE: clobbers OpMask registers
1389 ;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
;;; OUTPUT: ZIV0/ZIV1 hold the saved input of the last processed block
1390 %macro GEN_DES_DEC_CIPHER 2
1391 %define %%NUM_DES_BLOCKS %1
1396 %rep %%NUM_DES_BLOCKS
;; ZTMP12/ZTMP13 are used as the save area, so they must not be passed
;; to DES_ENC_DEC as temporaries (only ZTMP0-ZTMP11 are).
1397 vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round
1398 vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round
1399 DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
;; R is XOR-ed with IV1 and L with IV0, while the saved R goes to IV0 and
;; the saved L to IV1.
;; NOTE(review): presumably DES_ENC_DEC returns the halves swapped - confirm.
1400 vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1
1401 vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0
;; saved input of this block is the IV of the next one
1402 vmovdqa64 ZIV0, ZTMP12
1403 vmovdqa64 ZIV1, ZTMP13
1409 ;;; ===========================================================================
1410 ;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
1411 ;;; ===========================================================================
;;; Each block goes through three DES passes: ENC with DES_KS1, DEC with
;;; DES_KS2 and ENC with DES_KS3. CBC chaining between blocks is applied
;;; the same way as in the single DES case.
;;; The IV is NOT applied to the first block here; GENERIC_DES_ENC XORs
;;; ZIV0/ZIV1 into ZW0/ZW1 before invoking this macro.
1413 ;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
1414 ;;; DES_KS1 [in] - pointer to transposed key schedule 1
1415 ;;; DES_KS2 [in] - pointer to transposed key schedule 2
1416 ;;; DES_KS3 [in] - pointer to transposed key schedule 3
1418 ;;; NOTE: clobbers OpMask registers
1419 ;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
1420 %macro GEN_3DES_ENC_CIPHER 4
1421 %define %%NUM_DES_BLOCKS %1
1422 %define %%DES_KS1 %2
1423 %define %%DES_KS2 %3
1424 %define %%DES_KS3 %4
1430 %rep %%NUM_DES_BLOCKS
;; pass 1: encrypt with key schedule 1
1432 DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
;; pass 2: decrypt with key schedule 2 - note LN/RN are passed in the
;; opposite order compared to passes 1 and 3
1434 DES_ENC_DEC DEC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
;; pass 3: encrypt with key schedule 3
1436 DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
;; RNN indexes the R register of the following block; while it is still
;; within the 2 * NUM_DES_BLOCKS registers in use, the current output is
;; chained into that block.
;; NOTE(review): the ZIV0/ZIV1 copies are presumably the last-block
;; alternative of this condition - confirm.
1437 %if (RNN < (%%NUM_DES_BLOCKS * 2))
1438 vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0
1439 vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0
1441 vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7
1442 vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7
;; each block occupies two ZW registers, hence the step of 2
1447 %assign RNN (RNN + 2)
1448 %assign LNN (LNN + 2)
1453 ;;; ===========================================================================
1454 ;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
1455 ;;; ===========================================================================
;;; Each block goes through three DES passes: DEC with DES_KS1, ENC with
;;; DES_KS2 and DEC with DES_KS3. The result is XOR-ed with the current IV
;;; (ZIV0/ZIV1) and the saved input of the block becomes the next IV.
1457 ;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
1458 ;;; DES_KS1 [in] - pointer to transposed key schedule 1
1459 ;;; DES_KS2 [in] - pointer to transposed key schedule 2
1460 ;;; DES_KS3 [in] - pointer to transposed key schedule 3
1462 ;;; NOTE: clobbers OpMask registers
1463 ;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
;;; OUTPUT: ZIV0/ZIV1 hold the saved input of the last processed block
1464 %macro GEN_3DES_DEC_CIPHER 4
1465 %define %%NUM_DES_BLOCKS %1
1466 %define %%DES_KS1 %2
1467 %define %%DES_KS2 %3
1468 %define %%DES_KS3 %4
1472 %rep %%NUM_DES_BLOCKS
;; ZTMP12/ZTMP13 are used as the save area, so they must not be passed
;; to DES_ENC_DEC as temporaries (only ZTMP0-ZTMP11 are).
1473 vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round
1474 vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round
;; pass 1: decrypt with key schedule 1
1476 DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
;; pass 2: encrypt with key schedule 2 - note LN/RN are passed in the
;; opposite order compared to passes 1 and 3
1478 DES_ENC_DEC ENC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
;; pass 3: decrypt with key schedule 3
1480 DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
;; CBC: XOR with the IV, then the saved input becomes the next IV
1481 vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1
1482 vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0
1483 vmovdqa64 ZIV0, ZTMP12
1484 vmovdqa64 ZIV1, ZTMP13
1492 ;;; ===========================================================================
1493 ;;; DES CBC / DOCSIS DES ENCRYPT
1494 ;;; ===========================================================================
;;; Encrypts 16 buffers in parallel. Input and output pointers are read
;;; from STATE (_des_args_in / _des_args_out, entries 0 to 15) and indexed
;;; with OFFSET. Data is processed 64 bytes per buffer at a time, followed
;;; by one masked pass for a remainder smaller than 64 bytes.
1496 ;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
;;;                   3DES (3DES CBC)
1499 ;;; NOTE: clobbers OpMask registers
1500 %macro GENERIC_DES_ENC 1
1501 %define %%DES_DOCSIS %1
1503 ;; push the registers and allocate the stack frame
1505 sub rsp, STACKFRAME_size
;; r12-r15 and the original stack pointer are kept in the frame and
;; restored at the end of the macro
1507 mov [rsp + _rsp_save], rax ; original SP
1508 mov [rsp + _gpr_save + 0*8], r12
1509 mov [rsp + _gpr_save + 1*8], r13
1510 mov [rsp + _gpr_save + 2*8], r14
1511 mov [rsp + _gpr_save + 3*8], r15
;; Key schedule(s) are placed in the stack frame: one for DES and DOCSIS,
;; three (_key_sched, _key_sched2, _key_sched3) for 3DES.
1513 %ifnidn %%DES_DOCSIS, 3DES
1514 ;; DES and DOCSIS DES
1515 DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
1518 DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC
;; keep SIZE in the frame - it is reloaded after the 64 byte loop
1520 mov [rsp + _size_save], SIZE
1523 ;; This loop processes message in blocks of 64 bytes.
1524 ;; Anything smaller than 64 bytes is handled separately after the loop.
1525 %%_gen_des_enc_loop:
1527 jz %%_gen_des_enc_loop_end
;; load input for buffers 0 to 7
1529 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1530 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1531 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1532 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1533 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1534 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1535 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1536 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1537 vmovdqu64 ZW0, [IA0 + OFFSET]
1538 vmovdqu64 ZW1, [IA1 + OFFSET]
1539 vmovdqu64 ZW2, [IA2 + OFFSET]
1540 vmovdqu64 ZW3, [INP0 + OFFSET]
1541 vmovdqu64 ZW4, [INP1 + OFFSET]
1542 vmovdqu64 ZW5, [INP2 + OFFSET]
1543 vmovdqu64 ZW6, [INP3 + OFFSET]
1544 vmovdqu64 ZW7, [INP4 + OFFSET]
;; load input for buffers 8 to 15 (the same 8 pointer registers are reused)
1546 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1547 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1548 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1549 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1550 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1551 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1552 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1553 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1554 vmovdqu64 ZW8, [IA0 + OFFSET]
1555 vmovdqu64 ZW9, [IA1 + OFFSET]
1556 vmovdqu64 ZW10, [IA2 + OFFSET]
1557 vmovdqu64 ZW11, [INP0 + OFFSET]
1558 vmovdqu64 ZW12, [INP1 + OFFSET]
1559 vmovdqu64 ZW13, [INP2 + OFFSET]
1560 vmovdqu64 ZW14, [INP3 + OFFSET]
1561 vmovdqu64 ZW15, [INP4 + OFFSET]
;; transpose data on input; ZTMP0 to ZTMP13 are passed as temporaries
1564 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1566 ;; DES CBC ENC comes here
;; the IV is applied to the first block only; the cipher macro chains the
;; remaining blocks and leaves the next IV in ZIV0/ZIV1
1567 vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
1568 vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
1570 %ifnidn %%DES_DOCSIS, 3DES
1571 GEN_DES_ENC_CIPHER 8, rsp + _key_sched
1573 GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1576 ;; transpose data on output
1577 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
;; store output for buffers 0 to 7
1579 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
1580 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
1581 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
1582 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
1583 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
1584 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
1585 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
1586 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
1587 vmovdqu64 [IA0 + OFFSET], ZW0
1588 vmovdqu64 [IA1 + OFFSET], ZW1
1589 vmovdqu64 [IA2 + OFFSET], ZW2
1590 vmovdqu64 [INP0 + OFFSET], ZW3
1591 vmovdqu64 [INP1 + OFFSET], ZW4
1592 vmovdqu64 [INP2 + OFFSET], ZW5
1593 vmovdqu64 [INP3 + OFFSET], ZW6
1594 vmovdqu64 [INP4 + OFFSET], ZW7
;; store output for buffers 8 to 15
1596 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
1597 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
1598 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
1599 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
1600 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
1601 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
1602 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
1603 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
1604 vmovdqu64 [IA0 + OFFSET], ZW8
1605 vmovdqu64 [IA1 + OFFSET], ZW9
1606 vmovdqu64 [IA2 + OFFSET], ZW10
1607 vmovdqu64 [INP0 + OFFSET], ZW11
1608 vmovdqu64 [INP1 + OFFSET], ZW12
1609 vmovdqu64 [INP2 + OFFSET], ZW13
1610 vmovdqu64 [INP3 + OFFSET], ZW14
1611 vmovdqu64 [INP4 + OFFSET], ZW15
1614 jmp %%_gen_des_enc_loop
1615 %%_gen_des_enc_loop_end:
1616 ;; This is where we check if there is anything less than 64 bytes
1617 ;; of message left for processing.
1618 mov SIZE, [rsp + _size_save]
1620 jz %%_gen_des_enc_part_end
1621 ;; calculate min of bytes_left and 64, convert to qword mask
1622 GET_MASK8 IA0 ; IA0 = mask
;; k7 drives the masked loads and the masked stores of the partial pass;
;; the mask is also kept in the frame because IA0 is reused as a pointer
1624 kmovw k7, DWORD(IA0)
1625 mov [rsp + _mask_save], IA0
;; masked load for buffers 0 to 7 - {z} zeroes the qwords that are masked off
1627 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1628 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1629 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1630 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1631 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1632 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1633 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1634 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1635 vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
1636 vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
1637 vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
1638 vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
1639 vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
1640 vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
1641 vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
1642 vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
;; masked load for buffers 8 to 15
1644 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1645 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1646 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1647 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1648 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1649 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1650 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1651 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1652 vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
1653 vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
1654 vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
1655 vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
1656 vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
1657 vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
1658 vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
1659 vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
;; transpose data on input
1662 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1664 ;; DES CBC ENC comes here
1665 vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
1666 vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
;; get back the block mask saved above
;; NOTE(review): presumably used to pick one of the 1 to 7 block
;; variants that follow - confirm
1668 mov IA0, [rsp + _mask_save]
1677 ;; process one block and move to transpose out
1678 %ifnidn %%DES_DOCSIS, 3DES
1679 GEN_DES_ENC_CIPHER 1, rsp + _key_sched
1681 GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1683 jmp %%_transpose_out
1686 ;; process two blocks and move to transpose out
1687 %ifnidn %%DES_DOCSIS, 3DES
1688 GEN_DES_ENC_CIPHER 2, rsp + _key_sched
1690 GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1692 jmp %%_transpose_out
1695 ;; process three blocks and move to transpose out
1696 %ifnidn %%DES_DOCSIS, 3DES
1697 GEN_DES_ENC_CIPHER 3, rsp + _key_sched
1699 GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1701 jmp %%_transpose_out
1704 ;; process four blocks and move to transpose out
1705 %ifnidn %%DES_DOCSIS, 3DES
1706 GEN_DES_ENC_CIPHER 4, rsp + _key_sched
1708 GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1710 jmp %%_transpose_out
1717 ;; process five blocks and move to transpose out
1718 %ifnidn %%DES_DOCSIS, 3DES
1719 GEN_DES_ENC_CIPHER 5, rsp + _key_sched
1721 GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1723 jmp %%_transpose_out
1726 ;; process six blocks and move to transpose out
1727 %ifnidn %%DES_DOCSIS, 3DES
1728 GEN_DES_ENC_CIPHER 6, rsp + _key_sched
1730 GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1732 jmp %%_transpose_out
1735 ;; process seven blocks and move to transpose out
;; (no jump needed here - the transpose out code follows directly)
1736 %ifnidn %%DES_DOCSIS, 3DES
1737 GEN_DES_ENC_CIPHER 7, rsp + _key_sched
1739 GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1743 ;; transpose data on output
1744 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1746 ;; run masked stores
;; only the qwords selected by k7 are written to the output buffers
1747 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
1748 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
1749 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
1750 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
1751 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
1752 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
1753 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
1754 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
1755 vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
1756 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
1757 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
1758 vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
1759 vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
1760 vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
1761 vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
1762 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
1764 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
1765 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
1766 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
1767 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
1768 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
1769 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
1770 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
1771 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
1772 vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
1773 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
1774 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
1775 vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
1776 vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
1777 vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
1778 vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
1779 vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
1780 %%_gen_des_enc_part_end:
1782 ;; store IV and update pointers
1783 DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
1785 ;; CFB part for DOCSIS
;; on encryption the CFB partial block step runs after the CBC part
1786 %ifidn %%DES_DOCSIS, DOCSIS
1787 DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
;; NOTE(review): presumably wipes the key schedule(s) kept in the stack
;; frame - the macro is defined elsewhere, confirm
1790 CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
1792 ;; restore stack pointer and registers
1793 mov r12, [rsp + _gpr_save + 0*8]
1794 mov r13, [rsp + _gpr_save + 1*8]
1795 mov r14, [rsp + _gpr_save + 2*8]
1796 mov r15, [rsp + _gpr_save + 3*8]
1797 mov rsp, [rsp + _rsp_save] ; original SP
1800 ;;; ===========================================================================
1801 ;;; DES CBC / DOCSIS DES DECRYPT
1802 ;;; ===========================================================================
;;; Decrypts 16 buffers in parallel. Input and output pointers are read
;;; from STATE (_des_args_in / _des_args_out, entries 0 to 15) and indexed
;;; with OFFSET. Data is processed 64 bytes per buffer at a time, followed
;;; by one masked pass for a remainder smaller than 64 bytes.
1804 ;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
;;;                   3DES (3DES CBC)
1807 ;;; NOTE: clobbers OpMask registers
1808 %macro GENERIC_DES_DEC 1
1809 %define %%DES_DOCSIS %1
1811 ;; push the registers and allocate the stack frame
1813 sub rsp, STACKFRAME_size
;; r12-r15 and the original stack pointer are kept in the frame and
;; restored at the end of the macro
1815 mov [rsp + _rsp_save], rax ; original SP
1816 mov [rsp + _gpr_save + 0*8], r12
1817 mov [rsp + _gpr_save + 1*8], r13
1818 mov [rsp + _gpr_save + 2*8], r14
1819 mov [rsp + _gpr_save + 3*8], r15
;; Key schedule(s) are placed in the stack frame: one for DES and DOCSIS,
;; three (_key_sched, _key_sched2, _key_sched3) for 3DES.
1821 %ifnidn %%DES_DOCSIS, 3DES
1823 DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
1826 DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC
1829 ;; CFB part for DOCSIS
;; on decryption the CFB partial block step runs before the CBC part
1830 %ifidn %%DES_DOCSIS, DOCSIS
1831 DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
;; keep SIZE in the frame - it is reloaded after the 64 byte loop
1834 mov [rsp + _size_save], SIZE
1837 ;; This loop processes message in blocks of 64 bytes.
1838 ;; Anything smaller than 64 bytes is handled separately after the loop.
1839 %%_gen_des_dec_loop:
1841 jz %%_gen_des_dec_loop_end
;; load input for buffers 0 to 7
1843 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1844 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1845 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1846 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1847 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1848 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1849 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1850 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1851 vmovdqu64 ZW0, [IA0 + OFFSET]
1852 vmovdqu64 ZW1, [IA1 + OFFSET]
1853 vmovdqu64 ZW2, [IA2 + OFFSET]
1854 vmovdqu64 ZW3, [INP0 + OFFSET]
1855 vmovdqu64 ZW4, [INP1 + OFFSET]
1856 vmovdqu64 ZW5, [INP2 + OFFSET]
1857 vmovdqu64 ZW6, [INP3 + OFFSET]
1858 vmovdqu64 ZW7, [INP4 + OFFSET]
;; load input for buffers 8 to 15 (the same 8 pointer registers are reused)
1860 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1861 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1862 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1863 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1864 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1865 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1866 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1867 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1868 vmovdqu64 ZW8, [IA0 + OFFSET]
1869 vmovdqu64 ZW9, [IA1 + OFFSET]
1870 vmovdqu64 ZW10, [IA2 + OFFSET]
1871 vmovdqu64 ZW11, [INP0 + OFFSET]
1872 vmovdqu64 ZW12, [INP1 + OFFSET]
1873 vmovdqu64 ZW13, [INP2 + OFFSET]
1874 vmovdqu64 ZW14, [INP3 + OFFSET]
1875 vmovdqu64 ZW15, [INP4 + OFFSET]
;; transpose data on input; ZTMP0 to ZTMP13 are passed as temporaries
1878 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
;; unlike the encrypt direction no IV XOR is done here - the decrypt
;; cipher macros apply ZIV0/ZIV1 after each block is decrypted
1880 %ifnidn %%DES_DOCSIS, 3DES
1881 ;; DES CBC DEC comes here
1882 GEN_DES_DEC_CIPHER 8, rsp + _key_sched
1884 ;; 3DES CBC DEC comes here
1885 GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1888 ;; transpose data on output
1889 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
;; store output for buffers 0 to 7
1892 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
1893 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
1894 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
1895 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
1896 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
1897 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
1898 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
1899 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
1900 vmovdqu64 [IA0 + OFFSET], ZW0
1901 vmovdqu64 [IA1 + OFFSET], ZW1
1902 vmovdqu64 [IA2 + OFFSET], ZW2
1903 vmovdqu64 [INP0 + OFFSET], ZW3
1904 vmovdqu64 [INP1 + OFFSET], ZW4
1905 vmovdqu64 [INP2 + OFFSET], ZW5
1906 vmovdqu64 [INP3 + OFFSET], ZW6
1907 vmovdqu64 [INP4 + OFFSET], ZW7
;; store output for buffers 8 to 15
1909 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
1910 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
1911 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
1912 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
1913 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
1914 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
1915 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
1916 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
1917 vmovdqu64 [IA0 + OFFSET], ZW8
1918 vmovdqu64 [IA1 + OFFSET], ZW9
1919 vmovdqu64 [IA2 + OFFSET], ZW10
1920 vmovdqu64 [INP0 + OFFSET], ZW11
1921 vmovdqu64 [INP1 + OFFSET], ZW12
1922 vmovdqu64 [INP2 + OFFSET], ZW13
1923 vmovdqu64 [INP3 + OFFSET], ZW14
1924 vmovdqu64 [INP4 + OFFSET], ZW15
1927 jmp %%_gen_des_dec_loop
1928 %%_gen_des_dec_loop_end:
1929 ;; This is where we check if there is anything less than 64 bytes
1930 ;; of message left for processing.
1931 mov SIZE, [rsp + _size_save]
1933 jz %%_gen_des_dec_part_end
1934 ;; calculate min of bytes_left and 64, convert to qword mask
1935 GET_MASK8 IA0 ; IA0 = mask
;; k7 drives the masked loads and the masked stores of the partial pass;
;; the mask is also kept in the frame because IA0 is reused as a pointer
1937 kmovw k7, DWORD(IA0)
1938 mov [rsp + _mask_save], IA0
;; masked load for buffers 0 to 7 - {z} zeroes the qwords that are masked off
1940 mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
1941 mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
1942 mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
1943 mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
1944 mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
1945 mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
1946 mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
1947 mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
1948 vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
1949 vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
1950 vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
1951 vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
1952 vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
1953 vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
1954 vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
1955 vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
;; masked load for buffers 8 to 15
1957 mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
1958 mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
1959 mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
1960 mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
1961 mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
1962 mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
1963 mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
1964 mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
1965 vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
1966 vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
1967 vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
1968 vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
1969 vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
1970 vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
1971 vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
1972 vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
;; transpose data on input
1975 TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
1977 ;; DES CBC DEC comes here
;; get back the block mask saved above
;; NOTE(review): presumably used to pick one of the 1 to 7 block
;; variants that follow - confirm
1978 mov IA0, [rsp + _mask_save]
1986 ;; process one block and move to transpose out
1987 %ifnidn %%DES_DOCSIS, 3DES
1988 GEN_DES_DEC_CIPHER 1, rsp + _key_sched
1990 GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
1992 jmp %%_transpose_out
1995 ;; process two blocks and move to transpose out
1996 %ifnidn %%DES_DOCSIS, 3DES
1997 GEN_DES_DEC_CIPHER 2, rsp + _key_sched
1999 GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2001 jmp %%_transpose_out
2004 ;; process three blocks and move to transpose out
2005 %ifnidn %%DES_DOCSIS, 3DES
2006 GEN_DES_DEC_CIPHER 3, rsp + _key_sched
2008 GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2010 jmp %%_transpose_out
2013 ;; process four blocks and move to transpose out
2014 %ifnidn %%DES_DOCSIS, 3DES
2015 GEN_DES_DEC_CIPHER 4, rsp + _key_sched
2017 GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2019 jmp %%_transpose_out
2026 ;; process five blocks and move to transpose out
2027 %ifnidn %%DES_DOCSIS, 3DES
2028 GEN_DES_DEC_CIPHER 5, rsp + _key_sched
2030 GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2032 jmp %%_transpose_out
2035 ;; process six blocks and move to transpose out
2036 %ifnidn %%DES_DOCSIS, 3DES
2037 GEN_DES_DEC_CIPHER 6, rsp + _key_sched
2039 GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2041 jmp %%_transpose_out
2044 ;; process seven blocks and move to transpose out
;; (no jump needed here - the transpose out code follows directly)
2045 %ifnidn %%DES_DOCSIS, 3DES
2046 GEN_DES_DEC_CIPHER 7, rsp + _key_sched
2048 GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
2052 ;; transpose data on output
2053 TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
2055 ;; run masked stores
;; only the qwords selected by k7 are written to the output buffers
2056 mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
2057 mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
2058 mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
2059 mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
2060 mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
2061 mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
2062 mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
2063 mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
2064 vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
2065 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
2066 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
2067 vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
2068 vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
2069 vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
2070 vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
2071 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
2073 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
2074 mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
2075 mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
2076 mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
2077 mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
2078 mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
2079 mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
2080 mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
2081 vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
2082 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
2083 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
2084 vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
2085 vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
2086 vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
2087 vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
2088 vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
2089 %%_gen_des_dec_part_end:
2091 ;; store IV and update pointers
2092 DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
;; NOTE(review): presumably wipes the key schedule(s) kept in the stack
;; frame - the macro is defined elsewhere, confirm
2094 CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
2096 ;; restore stack pointer and registers
2097 mov r12, [rsp + _gpr_save + 0*8]
2098 mov r13, [rsp + _gpr_save + 1*8]
2099 mov r14, [rsp + _gpr_save + 2*8]
2100 mov r15, [rsp + _gpr_save + 3*8]
2101 mov rsp, [rsp + _rsp_save] ; original SP
2105 ;;; ========================================================
;;; Table of 19 distinct 32-bit mask constants. Each constant is written
;;; 16 times (4 lines x 4 dwords = 64 bytes), so one row holds the same
;;; value in every 32-bit lane of a 64-byte vector.
;;; Row r starts at byte offset r*64 from the start of the table.
;;; NOTE(review): the code that indexes this table is outside this view;
;;; the values look like per-bit selection masks - confirm against the user.
2112 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
2113 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
2114 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
2115 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
2116 dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
2117 dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
2118 dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
2119 dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
2120 dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
2121 dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
2122 dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
2123 dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
2124 dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
2125 dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
2126 dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
2127 dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
2128 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
2129 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
2130 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
2131 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
2132 dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
2133 dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
2134 dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
2135 dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
2136 dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
2137 dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
2138 dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
2139 dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
2140 dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
2141 dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
2142 dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
2143 dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
2144 dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
2145 dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
2146 dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
2147 dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
2148 dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
2149 dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
2150 dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
2151 dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
2152 dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
2153 dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
2154 dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
2155 dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
2156 dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
2157 dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
2158 dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
2159 dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
2160 dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
2161 dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
2162 dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
2163 dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
2164 dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
2165 dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
2166 dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
2167 dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
2168 dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
2169 dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
2170 dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
2171 dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
2172 dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
2173 dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
2174 dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
2175 dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
2176 dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
2177 dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
2178 dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
2179 dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
2180 dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
2181 dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
2182 dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
2183 dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
2184 dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
2185 dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
2186 dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
2187 dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
;;; Five 32-bit masks (0x0f0f0f0f, 0x0000ffff, 0x33333333, 0x00ff00ff,
;;; 0x55555555), each written 16 times so that one mask occupies 64 bytes.
;;; Mask m starts at byte offset m*64 from the start of the table.
;;; NOTE(review): these are the masks commonly used by swap-based DES
;;; initial/final permutations - confirm against the permutation code.
2191 dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
2192 dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
2193 dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
2194 dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
2195 dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
2196 dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
2197 dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
2198 dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
2199 dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
2200 dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
2201 dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
2202 dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
2203 dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
2204 dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
2205 dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
2206 dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
2207 dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
2208 dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
2209 dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
2210 dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
;;; Eight consecutive lookup tables. Each table has 64 entries stored as
;;; 16-bit words (8 lines x 8 words = 128 bytes per table), and every entry
;;; is a 4-bit value in the range 0x00-0x0f.
;;; NOTE(review): the 8 x 64 x 4-bit shape matches the DES S-boxes; the entry
;;; order presumably follows the lookup scheme of the consuming code rather
;;; than the textbook row/column layout - confirm before editing any value.
;; table 0
2216 dw 0x07, 0x02, 0x0c, 0x0f, 0x04, 0x0b, 0x0a, 0x0c
2217 dw 0x0b, 0x07, 0x06, 0x09, 0x0d, 0x04, 0x00, 0x0a
2218 dw 0x02, 0x08, 0x05, 0x03, 0x0f, 0x06, 0x09, 0x05
2219 dw 0x08, 0x01, 0x03, 0x0e, 0x01, 0x0d, 0x0e, 0x00
2220 dw 0x00, 0x0f, 0x05, 0x0a, 0x07, 0x02, 0x09, 0x05
2221 dw 0x0e, 0x01, 0x03, 0x0c, 0x0b, 0x08, 0x0c, 0x06
2222 dw 0x0f, 0x03, 0x06, 0x0d, 0x04, 0x09, 0x0a, 0x00
2223 dw 0x02, 0x04, 0x0d, 0x07, 0x08, 0x0e, 0x01, 0x0b
;; table 1
2225 dw 0x0f, 0x00, 0x09, 0x0a, 0x06, 0x05, 0x03, 0x09
2226 dw 0x01, 0x0e, 0x04, 0x03, 0x0c, 0x0b, 0x0a, 0x04
2227 dw 0x08, 0x07, 0x0e, 0x01, 0x0d, 0x02, 0x00, 0x0c
2228 dw 0x07, 0x0d, 0x0b, 0x06, 0x02, 0x08, 0x05, 0x0f
2229 dw 0x0c, 0x0b, 0x03, 0x0d, 0x0f, 0x0c, 0x06, 0x00
2230 dw 0x02, 0x05, 0x08, 0x0e, 0x01, 0x02, 0x0d, 0x07
2231 dw 0x0b, 0x01, 0x00, 0x06, 0x04, 0x0f, 0x09, 0x0a
2232 dw 0x0e, 0x08, 0x05, 0x03, 0x07, 0x04, 0x0a, 0x09
;; table 2
2234 dw 0x05, 0x0b, 0x08, 0x0d, 0x06, 0x01, 0x0d, 0x0a
2235 dw 0x09, 0x02, 0x03, 0x04, 0x0f, 0x0c, 0x04, 0x07
2236 dw 0x00, 0x06, 0x0b, 0x08, 0x0c, 0x0f, 0x02, 0x05
2237 dw 0x07, 0x09, 0x0e, 0x03, 0x0a, 0x00, 0x01, 0x0e
2238 dw 0x0b, 0x08, 0x04, 0x02, 0x0c, 0x06, 0x03, 0x0d
2239 dw 0x00, 0x0b, 0x0a, 0x07, 0x06, 0x01, 0x0f, 0x04
2240 dw 0x0e, 0x05, 0x01, 0x0f, 0x02, 0x09, 0x0d, 0x0a
2241 dw 0x09, 0x00, 0x07, 0x0c, 0x05, 0x0e, 0x08, 0x03
;; table 3
2243 dw 0x0e, 0x05, 0x08, 0x0f, 0x00, 0x03, 0x0d, 0x0a
2244 dw 0x07, 0x09, 0x01, 0x0c, 0x09, 0x0e, 0x02, 0x01
2245 dw 0x0b, 0x06, 0x04, 0x08, 0x06, 0x0d, 0x03, 0x04
2246 dw 0x0c, 0x00, 0x0a, 0x07, 0x05, 0x0b, 0x0f, 0x02
2247 dw 0x0b, 0x0c, 0x02, 0x09, 0x06, 0x05, 0x08, 0x03
2248 dw 0x0d, 0x00, 0x04, 0x0a, 0x00, 0x0b, 0x07, 0x04
2249 dw 0x01, 0x0f, 0x0e, 0x02, 0x0f, 0x08, 0x05, 0x0e
2250 dw 0x0a, 0x06, 0x03, 0x0d, 0x0c, 0x01, 0x09, 0x07
;; table 4
2252 dw 0x04, 0x02, 0x01, 0x0f, 0x0e, 0x05, 0x0b, 0x06
2253 dw 0x02, 0x08, 0x0c, 0x03, 0x0d, 0x0e, 0x07, 0x00
2254 dw 0x03, 0x04, 0x0a, 0x09, 0x05, 0x0b, 0x00, 0x0c
2255 dw 0x08, 0x0d, 0x0f, 0x0a, 0x06, 0x01, 0x09, 0x07
2256 dw 0x07, 0x0d, 0x0a, 0x06, 0x02, 0x08, 0x0c, 0x05
2257 dw 0x04, 0x03, 0x0f, 0x00, 0x0b, 0x04, 0x01, 0x0a
2258 dw 0x0d, 0x01, 0x00, 0x0f, 0x0e, 0x07, 0x09, 0x02
2259 dw 0x03, 0x0e, 0x05, 0x09, 0x08, 0x0b, 0x06, 0x0c
;; table 5
2261 dw 0x03, 0x09, 0x00, 0x0e, 0x09, 0x04, 0x07, 0x08
2262 dw 0x05, 0x0f, 0x0c, 0x02, 0x06, 0x03, 0x0a, 0x0d
2263 dw 0x08, 0x07, 0x0b, 0x00, 0x04, 0x01, 0x0e, 0x0b
2264 dw 0x0f, 0x0a, 0x02, 0x05, 0x01, 0x0c, 0x0d, 0x06
2265 dw 0x05, 0x02, 0x06, 0x0d, 0x0e, 0x09, 0x00, 0x06
2266 dw 0x02, 0x04, 0x0b, 0x08, 0x09, 0x0f, 0x0c, 0x01
2267 dw 0x0f, 0x0c, 0x08, 0x07, 0x03, 0x0a, 0x0d, 0x00
2268 dw 0x04, 0x03, 0x07, 0x0e, 0x0a, 0x05, 0x01, 0x0b
;; table 6
2270 dw 0x02, 0x08, 0x0c, 0x05, 0x0f, 0x03, 0x0a, 0x00
2271 dw 0x04, 0x0d, 0x09, 0x06, 0x01, 0x0e, 0x06, 0x09
2272 dw 0x0d, 0x02, 0x03, 0x0f, 0x00, 0x0c, 0x05, 0x0a
2273 dw 0x07, 0x0b, 0x0e, 0x01, 0x0b, 0x07, 0x08, 0x04
2274 dw 0x0b, 0x06, 0x07, 0x09, 0x02, 0x08, 0x04, 0x07
2275 dw 0x0d, 0x0b, 0x0a, 0x00, 0x08, 0x05, 0x01, 0x0c
2276 dw 0x00, 0x0d, 0x0c, 0x0a, 0x09, 0x02, 0x0f, 0x04
2277 dw 0x0e, 0x01, 0x03, 0x0f, 0x05, 0x0e, 0x06, 0x03
;; table 7
2279 dw 0x0b, 0x0e, 0x05, 0x00, 0x06, 0x09, 0x0a, 0x0f
2280 dw 0x01, 0x02, 0x0c, 0x05, 0x0d, 0x07, 0x03, 0x0a
2281 dw 0x04, 0x0d, 0x09, 0x06, 0x0f, 0x03, 0x00, 0x0c
2282 dw 0x02, 0x08, 0x07, 0x0b, 0x08, 0x04, 0x0e, 0x01
2283 dw 0x08, 0x04, 0x03, 0x0f, 0x05, 0x02, 0x00, 0x0c
2284 dw 0x0b, 0x07, 0x06, 0x09, 0x0e, 0x01, 0x09, 0x06
2285 dw 0x0f, 0x08, 0x0a, 0x03, 0x0c, 0x05, 0x07, 0x0a
2286 dw 0x01, 0x0e, 0x0d, 0x00, 0x02, 0x0b, 0x04, 0x0d
2288 ;;; Used in DOCSIS DES partial block scheduling: 16 x 32-bit lanes, each holding the value 1 (64 bytes total)
2291 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
;;; 64-byte mask: 0x3f00 in every 16-bit word, i.e. it keeps bits 8-13
;;; (six bits) of each word and clears the rest.
;;; NOTE(review): presumably isolates a 6-bit field from the upper byte of
;;; each word - confirm against the code that applies it.
2295 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
2296 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
2297 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
2298 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
;;; 64-byte mask: 0x003f in every 16-bit word, i.e. it keeps bits 0-5
;;; (six bits) of each word and clears the rest.
;;; NOTE(review): presumably isolates a 6-bit field from the lower byte of
;;; each word - confirm against the code that applies it.
2302 dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
2303 dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
2304 dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
2305 dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
;;; 64 byte indices (0x00-0x3f), one 16-byte group per line. In memory order
;;; each group lists the bytes of its even-numbered 16-bit words first
;;; (words 0,2,4,6) followed by the bytes of its odd-numbered words
;;; (words 1,3,5,7); successive lines add 0x10 to every index.
;;; NOTE(review): presumably a control vector for a byte permute/shuffle -
;;; confirm which instruction consumes it.
2309 dq 0x0d0c090805040100, 0x0f0e0b0a07060302
2310 dq 0x1d1c191815141110, 0x1f1e1b1a17161312
2311 dq 0x2d2c292825242120, 0x2f2e2b2a27262322
2312 dq 0x3d3c393835343130, 0x3f3e3b3a37363332
;;; 64-byte constant: the value 0x001f (31) in each of the 32 16-bit words.
;;; NOTE(review): purpose is not visible from this view - confirm against the
;;; code that loads it.
2316 dq 0x001f001f001f001f, 0x001f001f001f001f
2317 dq 0x001f001f001f001f, 0x001f001f001f001f
2318 dq 0x001f001f001f001f, 0x001f001f001f001f
2319 dq 0x001f001f001f001f, 0x001f001f001f001f
;;; 64 byte indices (0x00-0x3f). Within every group of four indices the two
;;; middle ones are exchanged (memory order 0,2,1,3 / 4,6,5,7 / ...), so a
;;; byte permute driven by this table swaps bytes 1 and 2 of each dword.
;;; NOTE(review): presumably a control vector for a byte permute/shuffle -
;;; confirm which instruction consumes it.
2323 dq 0x0705060403010200, 0x0f0d0e0c0b090a08
2324 dq 0x1715161413111210, 0x1f1d1e1c1b191a18
2325 dq 0x2725262423212220, 0x2f2d2e2c2b292a28
2326 dq 0x3735363433313230, 0x3f3d3e3c3b393a38
2328 ;;; ========================================================
;;; des_x16_cbc_enc_avx512
;;; Entry point published through MKGLOBAL with the attributes
;;; `function, internal`.
;;; NOTE(review): presumably single-DES CBC encryption across 16 lanes, by
;;; analogy with the 3DES and DOCSIS entry points - confirm.
2332 ;;; arg 1 : pointer to DES OOO structure
2333 ;;; arg 2 : size in bytes
2335 MKGLOBAL(des_x16_cbc_enc_avx512,function,internal)
2336 des_x16_cbc_enc_avx512:
;;; des_x16_cbc_dec_avx512
;;; Entry point published through MKGLOBAL with the attributes
;;; `function, internal`.
;;; NOTE(review): presumably single-DES CBC decryption across 16 lanes, by
;;; analogy with the 3DES and DOCSIS entry points - confirm.
2340 ;;; arg 1 : pointer to DES OOO structure
2341 ;;; arg 2 : size in bytes
2343 MKGLOBAL(des_x16_cbc_dec_avx512,function,internal)
2344 des_x16_cbc_dec_avx512:
;;; des3_x16_cbc_enc_avx512
;;; Expands the GENERIC_DES_ENC macro with the `3DES` variant selector.
;;; The symbol is published through MKGLOBAL with the attributes
;;; `function, internal`.
2348 ;;; arg 1 : pointer to DES OOO structure
2349 ;;; arg 2 : size in bytes
2351 MKGLOBAL(des3_x16_cbc_enc_avx512,function,internal)
2352 des3_x16_cbc_enc_avx512:
2353 GENERIC_DES_ENC 3DES
;;; des3_x16_cbc_dec_avx512
;;; Expands the GENERIC_DES_DEC macro with the `3DES` variant selector.
;;; The symbol is published through MKGLOBAL with the attributes
;;; `function, internal`.
2356 ;;; arg 1 : pointer to DES OOO structure
2357 ;;; arg 2 : size in bytes
2359 MKGLOBAL(des3_x16_cbc_dec_avx512,function,internal)
2360 des3_x16_cbc_dec_avx512:
2361 GENERIC_DES_DEC 3DES
;;; docsis_des_x16_enc_avx512
;;; Expands the GENERIC_DES_ENC macro with the `DOCSIS` variant selector.
;;; The symbol is published through MKGLOBAL with the attributes
;;; `function, internal`.
2364 ;;; arg 1 : pointer to DES OOO structure
2365 ;;; arg 2 : size in bytes
2367 MKGLOBAL(docsis_des_x16_enc_avx512,function,internal)
2368 docsis_des_x16_enc_avx512:
2369 GENERIC_DES_ENC DOCSIS
;;; docsis_des_x16_dec_avx512
;;; Expands the GENERIC_DES_DEC macro with the `DOCSIS` variant selector.
;;; The symbol is published through MKGLOBAL with the attributes
;;; `function, internal`.
2372 ;;; arg 1 : pointer to DES OOO structure
2373 ;;; arg 2 : size in bytes
2375 MKGLOBAL(docsis_des_x16_dec_avx512,function,internal)
2376 docsis_des_x16_dec_avx512:
2377 GENERIC_DES_DEC DOCSIS
;;; Empty .note.GNU-stack section (noalloc, noexec, nowrite): by GNU toolchain
;;; convention this marks the object as not requiring an executable stack.
2381 section .note.GNU-stack noalloc noexec nowrite progbits