;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; SHA1 code, hybrid, rolled, interleaved
; Uses AVX instructions
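;;
;; Overview: this routine computes one SHA-1 compression of a single 64-byte
;; block, per FIPS 180-1/180-4:
;;
;;     for t = 0..79:
;;         T = ROTL32(a,5) + f_t(b,c,d) + e + K_t + W_t
;;         e = d;  d = c;  c = ROTL32(b,30);  b = a;  a = T
;;
;; with the message schedule
;;     W_t = ROTL32(W_(t-3) ^ W_(t-8) ^ W_(t-14) ^ W_(t-16), 1)   for t >= 16
;;
;; "Hybrid, rolled, interleaved": scalar (GPR) rounds are interleaved with
;; AVX computation of the next four schedule words, and each 20-round group
;; is a short runtime loop rather than being fully unrolled.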
%include "os.asm"

section .data
default rel
align 16
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19:                  ;ddq 0x5A8279995A8279995A8279995A827999
        dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39:                  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
        dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:                  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
        dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:                  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
        dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
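;; The K tables hold the standard FIPS 180-1 round constants
;; (floor(2^30 * sqrt(n)) for n = 2, 3, 5, 10), each replicated into all
;; four dword lanes so a single vpaddd adds the round constant to four
;; schedule words at once.  PSHUFFLE_BYTE_FLIP_MASK is the vpshufb control
;; that byte-swaps each dword of the (big-endian) message words.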

section .text

%define VMOVDQ vmovdqu ;; assume buffers not aligned

%ifdef LINUX
%define INP  rdi ; 1st arg
%define CTX  rsi ; 2nd arg
%define REG3 edx
%define REG4 ecx
%else
%define INP  rcx ; 1st arg
%define CTX  rdx ; 2nd arg
%define REG3 edi
%define REG4 esi
%endif
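;; On Linux (System V AMD64) the arguments arrive in rdi/rsi, so the spare
;; round registers REG3/REG4 can use the volatile edx/ecx.  On Windows
;; (Microsoft x64) the arguments arrive in rcx/rdx, so REG3/REG4 fall back
;; to edi/esi; rsi, rdi and xmm6-xmm15 are callee-saved in that ABI, which
;; is why the prologue below preserves rsi, rdi and xmm6-xmm8 unconditionally.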

%define FRAMESZ 3*16 + 1*8
%define _RSP FRAMESZ-1*8 + rsp
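;; Local stack frame: three 16-byte slots for saving xmm6-xmm8 plus one
;; 8-byte slot that keeps the caller's rsp while the frame is realigned to
;; 64 bytes.  _RSP addresses that last qword relative to the aligned rsp
;; (i.e. [rsp + FRAMESZ - 8]).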

%define a eax
%define b ebx
%define c REG3
%define d REG4
%define e r8d
%define T1 r9d
%define f r10d
%define RND r11d
%define g r12d
%define h r13d

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XK    xmm2

%xdefine X0 xmm3
%xdefine X1 xmm4
%xdefine X2 xmm5
%xdefine X3 xmm6
%xdefine X4 xmm7

%define XFER xmm8

%define SZ 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X4
%xdefine X4 X_
%endmacro
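;; rotate_Xs only renames the X0-X4 symbols at assembly time (it emits no
;; instructions); five consecutive rotations restore the original mapping.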

%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
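;; ROTATE_ARGS is likewise a pure assembly-time rename over eight GPR names.
;; Only a-e carry SHA-1 state; f, g and h are spare slots, and the register
;; currently named h is overwritten each round with the new working value,
;; becoming "a" after the rename.  Eight rotations return to the original
;; mapping.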


;; Magic functions defined in FIPS 180-1
;;
; macro MAGIC_F0 F,B,C,D,T   ;; F = (D ^ (B & (C ^ D)))
%macro MAGIC_F0 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov  %%regF,%%regC
        xor  %%regF,%%regD
        and  %%regF,%%regB
        xor  %%regF,%%regD
%endmacro
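;; Note: D ^ (B & (C ^ D)) is the same boolean function as the FIPS "Ch"
;; form (B & C) | (~B & D), rewritten to avoid a NOT.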

; macro MAGIC_F1 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F1 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov  %%regF,%%regD
        xor  %%regF,%%regC
        xor  %%regF,%%regB
%endmacro

; macro MAGIC_F2 F,B,C,D,T   ;; F = ((B & C) | (B & D) | (C & D))
%macro MAGIC_F2 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov  %%regF,%%regB
        mov  %%regT,%%regB
        or   %%regF,%%regC
        and  %%regT,%%regC
        and  %%regF,%%regD
        or   %%regF,%%regT
%endmacro
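;; Note: the majority function is computed as ((B | C) & D) | (B & C), which
;; is equivalent to (B & C) | (B & D) | (C & D) but takes fewer operations
;; (two ANDs, two ORs and two moves), using %%regT as scratch.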

; macro MAGIC_F3 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F3 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
%endmacro

;; input is T1
%macro ROUND 1
%define %%MAGIC %1
        add  e,T1
        mov  T1,a
        rol  T1,5
        add  e,T1
        %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
        rol  b,30
        add  h,e
        ROTATE_ARGS
%endmacro
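;; ROUND expects T1 = K + W[t] on entry (extracted from XFER).  It computes
;;     e += T1 + ROTL32(a,5)
;;     h  = F(b,c,d) + e      ; h now holds the new working value T
;;     b  = ROTL32(b,30)
;; ROTATE_ARGS then renames so that the old h becomes the new a, old a -> b,
;; old b -> c, old c -> d and old d -> e, matching the FIPS 180-1 round
;; update.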

%macro do_4i 1
        vpaddd  XFER, XK, X0
        vpextrd T1, XFER, 0
        ;ROUND %1
        add  e,T1
        ;SCHEDULE_4
        vpalignr XTMP0, X1, X0, 8       ; XTMP0 = W[-14]
        mov  T1,a
        rol  T1,5
        vpxor   XTMP1, X2, X0           ; XTMP1 = W[-8] ^ W[-16]
        add  e,T1
        vpxor   XTMP0, XTMP0, XTMP1     ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
        %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)

        ;; Finish low half
        rol  b,30
        vpsrldq X4, X3, 4               ; X4 = W[-3] {xxBA}
        add  h,e
        ROTATE_ARGS
        vpextrd T1, XFER, 1
        ;ROUND %1
        add  e,T1
        vpxor   X4, X4, XTMP0
        mov  T1,a
        rol  T1,5
        ;; rotate X4 left 1
        vpsrld  XTMP1, X4, (32-1)
        add  e,T1
        vpslld  X4, X4, 1
        %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
        vpxor   X4, X4, XTMP1           ; X4 = W[0] {xxBA}
        rol  b,30
        add  h,e
        ROTATE_ARGS
        vpextrd T1, XFER, 2
        ;ROUND %1
        add  e,T1
        mov  T1,a

        ;; Finish high half
        vpalignr XTMP1, X4, X3, 4       ; XTMP1 = W[-3] {DCxx}
        rol  T1,5
        add  e,T1
        vpxor   XTMP0, XTMP0, XTMP1
        %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
        ;; rotate XTMP0 left 1
        vpsrld  XTMP1, XTMP0, (32-1)
        rol  b,30
        add  h,e
        ROTATE_ARGS
        vpextrd T1, XFER, 3
        ;ROUND %1
        add  e,T1
        mov  T1,a
        vpslld  XTMP0, XTMP0, 1
        rol  T1,5
        add  e,T1
        vpxor   XTMP0, XTMP0, XTMP1     ; XTMP0 = W[0] {DCxx}
        %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
        ;; COMBINE HALVES
        vshufps X4, X4, XTMP0, 11100100b ; X4 = X[0] {DCBA}
        rol  b,30
        add  h,e

        rotate_Xs
        ROTATE_ARGS
%endmacro
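;; do_4i runs four ROUNDs (with %1 selecting the F function) and, interleaved
;; with them, computes the next four schedule words into X4:
;;     W[t] = ROTL32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
;; Because W[t-3] overlaps the four words being produced, the low two words
;; ({xxBA}) are finished first, the high two ({DCxx}) are derived using them,
;; and vshufps merges the halves before rotate_Xs slides the 16-word window.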

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha1_block_avx(void *input_data, UINT32 digest[5])
;; arg 1 : (in) pointer to input data
;; arg 2 : (in/out) pointer to read/write digest
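;; This routine compresses exactly one 64-byte block into digest[0..4]; it
;; performs no message padding or length encoding (callers are expected to
;; handle that).  rbx, rsi, rdi, r12, r13 and xmm6-xmm8 are saved and
;; restored, and the stack is temporarily realigned to 64 bytes with the
;; caller's rsp kept at [_RSP].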
MKGLOBAL(sha1_block_avx,function,internal)
align 32
sha1_block_avx:
        push rbx
        push rsi
        push rdi
        push r12
        push r13

        vmovdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]

        mov  rax,rsp            ; copy rsp
        VMOVDQ X0, [INP + 0*16]
        sub  rsp,FRAMESZ
        VMOVDQ X1, [INP + 1*16]
        and  rsp,-64            ; align stack frame
        mov  [_RSP],rax         ; save copy of rsp

        vmovdqa [rsp + 0 * 16], xmm6
        vmovdqa [rsp + 1 * 16], xmm7
        vmovdqa [rsp + 2 * 16], xmm8

        ;; load next message block
        VMOVDQ X2, [INP + 2*16]
        VMOVDQ X3, [INP + 3*16]

        ;; set up a-e based on h0-h4
        ;; byte swap first 16 dwords
        mov  a, [SZ*0 + CTX]
        vpshufb X0, XTMP0
        mov  b, [SZ*1 + CTX]
        vpshufb X1, XTMP0
        mov  c, [SZ*2 + CTX]
        vpshufb X2, XTMP0
        mov  d, [SZ*3 + CTX]
        vpshufb X3, XTMP0
        mov  e, [SZ*4 + CTX]

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 00-19
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqa XK, [rel K00_19]
        mov  RND, 3
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp  loop1_5
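;; Each 20-round group runs as five do_4i calls: the jmp above enters the
;; loop at loopN_5, so the first pass executes a single do_4i, and the loop
;; then repeats (RND counts three decrements) for twenty rounds in total.
;; ROTATE_ARGS and rotate_Xs emit no instructions; the blocks before the
;; jmp, together with the renames assembled into the skipped first half of
;; the loop, add up to full rename cycles (8 and 5), so the register names
;; the loop body was assembled with match what the registers actually hold
;; on entry.  The vmovdqa moves at the loop bottom likewise put the schedule
;; words back where the next iteration expects them.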
align 16
loop1:

        do_4i MAGIC_F0

loop1_5:
        do_4i MAGIC_F0

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        vmovdqa X0, X2
        vmovdqa X2, X4
        vmovdqa X4, X1
        vmovdqa X1, X3

        sub  RND, 1
        jne  loop1

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 00-19
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 20-39
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqa XK, [rel K20_39]
        mov  RND, 3
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp  loop2_5
align 16
loop2:

        do_4i MAGIC_F1

loop2_5:
        do_4i MAGIC_F1

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        vmovdqa X0, X2
        vmovdqa X2, X4
        vmovdqa X4, X1
        vmovdqa X1, X3

        sub  RND, 1
        jne  loop2

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 20-39
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 40-59
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqa XK, [rel K40_59]
        mov  RND, 3
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp  loop3_5
align 16
loop3:

        do_4i MAGIC_F2

loop3_5:
        do_4i MAGIC_F2

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        vmovdqa X0, X2
        vmovdqa X2, X4
        vmovdqa X4, X1
        vmovdqa X1, X3

        sub  RND, 1
        jne  loop3

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 40-59
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 60-79
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
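;; The final group needs only one scheduled do_4i: it consumes W[60..63] and
;; produces the last four schedule words, after which X0..X3 hold W[64..79]
;; and the remaining sixteen rounds below use ROUND directly, with no further
;; message scheduling.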
        vmovdqa XK, [rel K60_79]

        do_4i MAGIC_F3

        vpaddd  XFER, XK, X0
        vpextrd T1, XFER, 0
        ROUND MAGIC_F3
        vpextrd T1, XFER, 1
        ROUND MAGIC_F3
        vpextrd T1, XFER, 2
        ROUND MAGIC_F3
        vpextrd T1, XFER, 3
        ROUND MAGIC_F3

        vpaddd  XFER, XK, X1
        vpextrd T1, XFER, 0
        ROUND MAGIC_F3
        vpextrd T1, XFER, 1
        ROUND MAGIC_F3
        vpextrd T1, XFER, 2
        ROUND MAGIC_F3
        vpextrd T1, XFER, 3
        ROUND MAGIC_F3

        vpaddd  XFER, XK, X2
        vpextrd T1, XFER, 0
        ROUND MAGIC_F3
        vpextrd T1, XFER, 1
        ROUND MAGIC_F3
        vpextrd T1, XFER, 2
        ROUND MAGIC_F3
        vpextrd T1, XFER, 3
        ROUND MAGIC_F3

        vpaddd  XFER, XK, X3
        vpextrd T1, XFER, 0
        ROUND MAGIC_F3
        vpextrd T1, XFER, 1
        ROUND MAGIC_F3
        vpextrd T1, XFER, 2
        ROUND MAGIC_F3
        vpextrd T1, XFER, 3
        ROUND MAGIC_F3

        ;; update result digest h0-h4
        add  [SZ*0 + CTX], a
        add  [SZ*1 + CTX], b
        add  [SZ*2 + CTX], c
        add  [SZ*3 + CTX], d
        add  [SZ*4 + CTX], e

        vmovdqa xmm8, [rsp + 2 * 16]
        vmovdqa xmm7, [rsp + 1 * 16]
        vmovdqa xmm6, [rsp + 0 * 16]

        mov  rsp,[_RSP]

        pop r13
        pop r12
        pop rdi
        pop rsi
        pop rbx

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif