;; ceph/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm
;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; SHA1 code, hybrid, rolled, interleaved
; Uses AVX instructions
%include "os.asm"

section .data
default rel
align 16
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19:  ;ddq 0x5A8279995A8279995A8279995A827999
        dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39:  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
        dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
        dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
        dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
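
;; K00_19..K60_79 hold the four SHA-1 round constants (FIPS 180-4), each
;; replicated into all four 32-bit lanes of an XMM register.
;; PSHUFFLE_BYTE_FLIP_MASK is the vpshufb control that byte-swaps each
;; message dword from big-endian to the little-endian form used by the
;; scalar code.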

section .text

%define VMOVDQ vmovdqu ;; assume buffers not aligned

%ifdef LINUX
%define INP   rdi ; 1st arg
%define CTX   rsi ; 2nd arg
%define REG3  ecx
%define REG4  edx
%else
%define INP   rcx ; 1st arg
%define CTX   rdx ; 2nd arg
%define REG3  edi
%define REG4  esi
%endif

%define FRAMESZ 3*16 + 1*8
%define _RSP    FRAMESZ-1*8 + rsp

%define a eax
%define b ebx
%define c REG3
%define d REG4
%define e r8d
%define T1 r9d
%define f r10d
%define RND r11d
%define g r12d
%define h r13d

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XK    xmm2

%xdefine X0 xmm3
%xdefine X1 xmm4
%xdefine X2 xmm5
%xdefine X3 xmm6
%xdefine X4 xmm7

%define XFER xmm8

%define SZ 4
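
;; Register roles: a-e are the five SHA-1 working variables; T1 carries
;; the extracted W[t]+K word and the rotated copy of 'a'; RND counts
;; loop iterations; f, g and h are spare registers cycled in by
;; ROTATE_ARGS so each round writes its result into a fresh register.
;; X0-X3 hold the current 16-dword schedule window, X4 the block being
;; computed, XTMP0/XTMP1 are schedule temporaries, XK the broadcast
;; round constant and XFER the four W+K values for the current rounds.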

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

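;; rotate_Xs cyclically renames X0-X4 (X0 <- X1 <- X2 <- X3 <- X4 <- X0)
;; at assembly time; it emits no instructions.  It advances the rolling
;; 16-dword message schedule window by one 4-dword block.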
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X4
%xdefine X4 X_
%endmacro

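;; ROTATE_ARGS is the assemble-time counterpart for the scalar state:
;; it cyclically renames a,b,c,d,e,f,g,h so a round can build its
;; result in 'h' and have it become the next round's 'a' without any
;; register-to-register moves.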
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm


;; Magic functions defined in FIPS 180-1
;;
; macro MAGIC_F0 F,B,C,D,T   ;; F = (D ^ (B & (C ^ D)))
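; MAGIC_F0 is the Ch ("choose") function used in rounds 0-19: each bit
; of B selects between the corresponding bits of C and D.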
%macro MAGIC_F0 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov      %%regF,%%regC
        xor      %%regF,%%regD
        and      %%regF,%%regB
        xor      %%regF,%%regD
%endmacro

; macro MAGIC_F1 F,B,C,D,T   ;; F = (B ^ C ^ D)
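; MAGIC_F1 is the parity function, used in rounds 20-39 and (via
; MAGIC_F3) in rounds 60-79.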
%macro MAGIC_F1 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov      %%regF,%%regD
        xor      %%regF,%%regC
        xor      %%regF,%%regB
%endmacro

; macro MAGIC_F2 F,B,C,D,T   ;; F = ((B & C) | (B & D) | (C & D))
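; MAGIC_F2 is the Maj (majority) function used in rounds 40-59; it is
; computed below as ((B | C) & D) | (B & C), which is equivalent to the
; form above.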
%macro MAGIC_F2 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov      %%regF,%%regB
        mov      %%regT,%%regB
        or       %%regF,%%regC
        and      %%regT,%%regC
        and      %%regF,%%regD
        or       %%regF,%%regT
%endmacro

; macro MAGIC_F3 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F3 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
%endmacro

;; input is T1
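;; ROUND performs one SHA-1 round, with W[t]+K already extracted into T1:
;;   T = e + (W[t] + K) + ROTL32(a,5) + F(b,c,d)  (accumulated via e and h)
;;   b = ROTL32(b,30)
;; ROTATE_ARGS then renames the registers so T (held in h) becomes the
;; next round's 'a'.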
%macro ROUND 1
%define %%MAGIC %1
        add      e,T1
        mov      T1,a
        rol      T1,5
        add      e,T1
        %%MAGIC  h,b,c,d,T1            ;; FUN = MAGIC_Fi(B,C,D)
        rol      b,30
        add      h,e
        ROTATE_ARGS
%endmacro

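;; do_4i processes four rounds with the MAGIC function given as %1 while
;; computing the next four message schedule dwords:
;;   W[t] = ROTL32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
;; XFER holds the four W+K values for the rounds being executed and X4
;; receives the new dwords; the low half {BA} is produced before the
;; high half {DC} because the high half's W[-3] inputs are the dwords
;; just computed.  Scalar round code is interleaved with the vector
;; schedule updates.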
%macro do_4i 1
        vpaddd   XFER, XK, X0
        vpextrd  T1, XFER, 0
        ;ROUND %1
        add      e,T1
        ;SCHEDULE_4
        vpalignr XTMP0, X1, X0, 8      ; XTMP0 = W[-14]
        mov      T1,a
        rol      T1,5
        vpxor    XTMP1, X2, X0         ; XTMP1 = W[-8] ^ W[-16]
        add      e,T1
        vpxor    XTMP0, XTMP0, XTMP1   ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
        %1       h,b,c,d,T1            ;; FUN = MAGIC_Fi(B,C,D)

        ;; Finish low half
        rol      b,30
        vpsrldq  X4, X3, 4             ; X4 = W[-3] {xxBA}
        add      h,e
        ROTATE_ARGS
        vpextrd  T1, XFER, 1
        ;ROUND %1
        add      e,T1
        vpxor    X4, X4, XTMP0
        mov      T1,a
        rol      T1,5
        ;; rotate X4 left 1
        vpsrld   XTMP1, X4, (32-1)
        add      e,T1
        vpslld   X4, X4, 1
        %1       h,b,c,d,T1            ;; FUN = MAGIC_Fi(B,C,D)
        vpxor    X4, X4, XTMP1         ; X4 = W[0] {xxBA}
        rol      b,30
        add      h,e
        ROTATE_ARGS
        vpextrd  T1, XFER, 2
        ;ROUND %1
        add      e,T1
        mov      T1,a

        ;; Finish high half
        vpalignr XTMP1, X4, X3, 4      ; XTMP1 = w[-3] {DCxx}
        rol      T1,5
        add      e,T1
        vpxor    XTMP0, XTMP0, XTMP1
        %1       h,b,c,d,T1            ;; FUN = MAGIC_Fi(B,C,D)
        ;; rotate XTMP0 left 1
        vpsrld   XTMP1, XTMP0, (32-1)
        rol      b,30
        add      h,e
        ROTATE_ARGS
        vpextrd  T1, XFER, 3
        ;ROUND %1
        add      e,T1
        mov      T1,a
        vpslld   XTMP0, XTMP0, 1
        rol      T1,5
        add      e,T1
        vpxor    XTMP0, XTMP0, XTMP1   ; XTMP0 = W[0] {DCxx}
        %1       h,b,c,d,T1            ;; FUN = MAGIC_Fi(B,C,D)
        ;; COMBINE HALVES
        vshufps  X4, X4, XTMP0, 11100100b ; X4 = X[0] {DCBA}
        rol      b,30
        add      h,e

        rotate_Xs
        ROTATE_ARGS
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha1_one_block_avx(void *input_data, UINT32 digest[5])
;; arg 1 : INP : rcx (Windows) / rdi (Linux) : pointer to input data
;; arg 2 : CTX : rdx (Windows) / rsi (Linux) : pointer to digest
MKGLOBAL(sha1_one_block_avx,function,)
align 32
sha1_one_block_avx:
        push     rbx
        push     rsi
        push     rdi
        push     r12
        push     r13

        ;; byte swap first 16 dwords
        vmovdqa  XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]
        mov      rax,rsp               ; copy rsp
        VMOVDQ   X0, [INP + 0*16]
        sub      rsp,FRAMESZ
        VMOVDQ   X1, [INP + 1*16]
        and      rsp,-64               ; align stack frame

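        ;; xmm6-xmm8 are non-volatile in the Windows x64 ABI; they are
        ;; saved (on both ABIs) into the aligned area just reserved on
        ;; the stack.  The caller's rsp is kept in rax and stored at
        ;; [_RSP] below.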
        vmovdqa  [rsp + 0 * 16], xmm6
        vmovdqa  [rsp + 1 * 16], xmm7
        vmovdqa  [rsp + 2 * 16], xmm8

        VMOVDQ   X2, [INP + 2*16]
        mov      [_RSP],rax            ; save copy of rsp
        VMOVDQ   X3, [INP + 3*16]
        ;; load initial digest
        mov      a,0x67452301
        vpshufb  X0, XTMP0
        mov      b,0xefcdab89
        vpshufb  X1, XTMP0
        mov      c,0x98badcfe
        vpshufb  X2, XTMP0
        mov      d,0x10325476
        vpshufb  X3, XTMP0
        mov      e,0xc3d2e1f0
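        ;; a-e now hold the standard SHA-1 initial hash values H0-H4;
        ;; the interleaved vpshufb instructions byte-swap the 16 message
        ;; dwords loaded above.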

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 00-19
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqa  XK, [rel K00_19]
        mov      RND, 3
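        ;; Each 20-round group runs do_4i five times: the jmp below
        ;; enters the loop at loop1_5, and the loop body then executes
        ;; twice more from loop1 (RND counts down from 3).  The
        ;; ROTATE_ARGS/rotate_Xs invocations emit no instructions; they
        ;; only pre-rotate the register names used inside the loop.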
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp      loop1_5
align 16
loop1:

        do_4i    MAGIC_F0

loop1_5:
        do_4i    MAGIC_F0

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        vmovdqa  X0, X2
        vmovdqa  X2, X4
        vmovdqa  X4, X1
        vmovdqa  X1, X3

        sub      RND, 1
        jne      loop1

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 00-19
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 20-39
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqa  XK, [rel K20_39]
        mov      RND, 3
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp      loop2_5
align 16
loop2:

        do_4i    MAGIC_F1

loop2_5:
        do_4i    MAGIC_F1

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        vmovdqa  X0, X2
        vmovdqa  X2, X4
        vmovdqa  X4, X1
        vmovdqa  X1, X3

        sub      RND, 1
        jne      loop2

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 20-39
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 40-59
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqa  XK, [rel K40_59]
        mov      RND, 3
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp      loop3_5
align 16
loop3:

        do_4i    MAGIC_F2

loop3_5:
        do_4i    MAGIC_F2

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        vmovdqa  X0, X2
        vmovdqa  X2, X4
        vmovdqa  X4, X1
        vmovdqa  X1, X3

        sub      RND, 1
        jne      loop3

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 40-59
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; do rounds 60-79
        vmovdqa  XK, [rel K60_79]

        do_4i    MAGIC_F3

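        ;; Rounds 64-79 need no further scheduling: W[64..79] already
        ;; sit in X0-X3, so each block below adds K60_79, extracts the
        ;; four W+K words and runs plain ROUND iterations.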
        vpaddd   XFER, XK, X0
        vpextrd  T1, XFER, 0
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 1
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 2
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 3
        ROUND    MAGIC_F3

        vpaddd   XFER, XK, X1
        vpextrd  T1, XFER, 0
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 1
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 2
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 3
        ROUND    MAGIC_F3

        vpaddd   XFER, XK, X2
        vpextrd  T1, XFER, 0
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 1
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 2
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 3
        ROUND    MAGIC_F3

        vpaddd   XFER, XK, X3
        vpextrd  T1, XFER, 0
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 1
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 2
        ROUND    MAGIC_F3
        vpextrd  T1, XFER, 3
        ROUND    MAGIC_F3

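        ;; Feed-forward: add the initial hash values back into the
        ;; working variables and store the five result dwords to the
        ;; digest buffer at CTX.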
        add      a,0x67452301
        mov      [SZ*0 + CTX], a
        add      b,0xefcdab89
        mov      [SZ*1 + CTX], b
        add      c,0x98badcfe
        mov      [SZ*2 + CTX], c
        add      d,0x10325476
        mov      [SZ*3 + CTX], d
        add      e,0xc3d2e1f0
        mov      [SZ*4 + CTX], e

        vmovdqa  xmm8, [rsp + 2 * 16]
        vmovdqa  xmm7, [rsp + 1 * 16]
        vmovdqa  xmm6, [rsp + 0 * 16]

        mov      rsp,[_RSP]
        pop      r13
        pop      r12
        pop      rdi
        pop      rsi
        pop      rbx

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif