]>
Commit | Line | Data |
---|---|---|
11fdf7f2 | 1 | ;; |
f67539c2 | 2 | ;; Copyright (c) 2012-2019, Intel Corporation |
11fdf7f2 TL |
3 | ;; |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
f67539c2 TL |
28 | %include "include/os.asm" |
29 | %include "include/memcpy.asm" | |
30 | %include "include/const.inc" | |
11fdf7f2 TL |
31 | |
32 | ; routine to do AES256 CNTR enc/decrypt "by4" | |
33 | ; XMM registers are clobbered. Saving/restoring must be done at a higher level | |
34 | ||
9f95a23c TL |
;; ---------------------------------------------------------------------
;; Exported symbol names.
;; If AES_CNTR_256 is already defined when this file is assembled, neither
;; default below is applied, so AES_CNTR_BIT_256 must then be supplied too.
;; ---------------------------------------------------------------------
%ifndef AES_CNTR_256
%define AES_CNTR_BIT_256        aes_cntr_bit_256_sse
%define AES_CNTR_256            aes_cntr_256_sse
%endif

;; Constants defined in another object file.
;; byteswap_const : pshufb control mask applied to the counter block
;; ddq_add_N      : 128-bit addend used to advance the counter
;;                  (presumably the value N - confirm at the definition)
extern byteswap_const
extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4

;; Token-pasting helper, e.g. CONCAT(xdata,2) -> xdata2 -> xmm2
%define CONCAT(a,b)     a %+ b
;; Move used for message data (no alignment requirement on in/out buffers)
%define MOVDQ           movdqu

;; ---------------------------------------------------------------------
;; XMM register roles.
;; Several names deliberately share a register; the second name of each
;; pair is only used once the first is no longer needed:
;;   xpart -> xmm1  (same as xdata1)
;;   xtmp  -> xmm8  (same as xcounter)
;;   xtmp2 -> xmm9  (same as xbyteswap)
;;   xtmp3 -> xmm10 (same as xkey0)
;; ---------------------------------------------------------------------
%define xdata0          xmm0
%define xdata1          xmm1
%define xdata2          xmm2
%define xdata3          xmm3
%define xdata4          xmm4
%define xdata5          xmm5
%define xdata6          xmm6
%define xdata7          xmm7
%define xpart           xmm1

%define xcounter        xmm8
%define xbyteswap       xmm9
%define xtmp            xmm8
%define xtmp2           xmm9
%define xtmp3           xmm10

%define xkey0           xmm10
%define xkey4           xmm11
%define xkey8           xmm12
%define xkey12          xmm13
%define xkeyA           xmm14
%define xkeyB           xmm15

;; ---------------------------------------------------------------------
;; Function arguments.
;; num_bytes and num_bits name the same register: the bit-length variant
;; converts the bit count to a byte count in place.
;; ---------------------------------------------------------------------
%ifdef LINUX
;; System V AMD64: all six arguments arrive in registers
%define p_in            rdi
%define p_IV            rsi
%define p_keys          rdx
%define p_out           rcx
%define num_bytes       r8
%define num_bits        r8
%define p_ivlen         r9
%else
;; Otherwise (Microsoft x64 register order): arguments 5 and 6 are on the
;; stack. p_ivlen is addressed relative to the rsp value at function entry,
;; so it is only valid while nothing has been pushed.
%define p_in            rcx
%define p_IV            rdx
%define p_keys          r8
%define p_out           r9
%define num_bytes       r10
%define num_bits        r10
%define p_ivlen         qword [rsp + 8*6]
%endif

;; General-purpose scratch registers
%define tmp             r11

;; Used by the CNTR_BIT variant only (DO_CNTR pushes/pops r12-r14 for it)
%define r_bits          r12     ; number of bits used in the last byte (0-7)
%define tmp2            r13
%define mask            r14
89 | ||
;; do_aes_load  num_in_par, cntr_type
;; Same as do_aes, with the key-loading flag set (xkey0/4/8/12 are
;; (re)loaded from p_keys while the blocks are processed).
%macro do_aes_load 2
%define %%num_in_par    %1      ; [in] number of blocks processed in parallel
%define %%type          %2      ; [in] CNTR or CNTR_BIT
        do_aes  %%num_in_par, %%type, 1
%endmacro
93 | ||
f67539c2 TL |
;; do_aes_noload  num_in_par, cntr_type
;; Same as do_aes, with the key-loading flag cleared: the caller must have
;; loaded xkey0, xkey4, xkey8 and xkey12 beforehand.
%macro do_aes_noload 2
%define %%num_in_par    %1      ; [in] number of blocks processed in parallel
%define %%type          %2      ; [in] CNTR or CNTR_BIT
        do_aes  %%num_in_par, %%type, 0
%endmacro
97 | ||
98 | ||
;-----------------------------------------------------------------------
; do_aes  num_in_par, cntr_type, load_keys
;
; Encrypts `num_in_par` consecutive counter blocks with the 14-round
; (AES-256) key schedule at p_keys, XORs the result with `num_in_par`
; 16-byte blocks read from p_in and stores them at p_out.
;
;   %1 num_in_par : number of 16-byte blocks processed in parallel
;   %2 cntr_type  : CNTR     - counter advanced with 32-bit adds
;                   CNTR_BIT - counter advanced with 64-bit adds; when this
;                              batch ends the message and r_bits != 0, the
;                              bits of the last output byte that lie beyond
;                              the message are kept from the existing
;                              contents of the output buffer
;   %3 load_keys  : non-zero -> load xkey0/xkey4/xkey8/xkey12 from p_keys
;                   zero     -> the caller already loaded them
;
; In:     xcounter, xbyteswap, p_keys (read with movdqa, so it must be
;         16-byte aligned), p_in, p_out, num_bytes and r_bits (CNTR_BIT)
; Out:    xcounter advanced by num_in_par
;         p_in advanced by 16*num_in_par; p_out is NOT advanced
; Clobb:  xdata0..xdata(num_in_par-1), xkeyA, xkeyB, flags,
;         assemble-time variables i and j (and idx for CNTR_BIT).
;         CNTR_BIT end-of-message path also clobbers tmp, mask,
;         xtmp (= xcounter) and xtmp2 (= xbyteswap).
;-----------------------------------------------------------------------
%macro do_aes 3
%define %%by            %1
%define %%cntr_type     %2
%define %%load_keys     %3

%if (%%load_keys)
        movdqa  xkey0, [p_keys + 0*16]
%endif

        ;; Build the counter blocks: block i uses (xcounter + i), shuffled
        ;; with xbyteswap before encryption
        movdqa  xdata0, xcounter
        pshufb  xdata0, xbyteswap
%assign i 1
%rep (%%by - 1)
        movdqa  CONCAT(xdata,i), xcounter
%ifidn %%cntr_type, CNTR_BIT
        ;; 64-bit add, so that a carry out of the low dword is handled the
        ;; same way as in the xcounter update below
        paddq   CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
%else
        paddd   CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
%endif
        pshufb  CONCAT(xdata,i), xbyteswap
%assign i (i + 1)
%endrep

        movdqa  xkeyA, [p_keys + 1*16]

        pxor    xdata0, xkey0
        ;; Advance the running counter past the blocks of this batch
%ifidn %%cntr_type, CNTR_BIT
        paddq   xcounter, [rel CONCAT(ddq_add_,%%by)]
%else
        paddd   xcounter, [rel CONCAT(ddq_add_,%%by)]
%endif

%assign i 1
%rep (%%by - 1)
        pxor    CONCAT(xdata,i), xkey0
%assign i (i + 1)
%endrep

        ;; Rounds 1-13: the next round key is loaded into xkeyA/xkeyB
        ;; (alternating) while the current round is applied to every block
        movdqa  xkeyB, [p_keys + 2*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyA          ; key 1
%assign i (i+1)
%endrep

        movdqa  xkeyA, [p_keys + 3*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyB          ; key 2
%assign i (i+1)
%endrep

        ;; From here on the input blocks are addressed at negative offsets
        add     p_in, 16*%%by

%if (%%load_keys)
        movdqa  xkey4, [p_keys + 4*16]
%endif
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyA          ; key 3
%assign i (i+1)
%endrep

        movdqa  xkeyA, [p_keys + 5*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkey4          ; key 4
%assign i (i+1)
%endrep

        movdqa  xkeyB, [p_keys + 6*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyA          ; key 5
%assign i (i+1)
%endrep

        movdqa  xkeyA, [p_keys + 7*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyB          ; key 6
%assign i (i+1)
%endrep

%if (%%load_keys)
        movdqa  xkey8, [p_keys + 8*16]
%endif
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyA          ; key 7
%assign i (i+1)
%endrep

        movdqa  xkeyA, [p_keys + 9*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkey8          ; key 8
%assign i (i+1)
%endrep

        movdqa  xkeyB, [p_keys + 10*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyA          ; key 9
%assign i (i+1)
%endrep

        movdqa  xkeyA, [p_keys + 11*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyB          ; key 10
%assign i (i+1)
%endrep

%if (%%load_keys)
        movdqa  xkey12, [p_keys + 12*16]
%endif
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyA          ; key 11
%assign i (i+1)
%endrep

        movdqa  xkeyA, [p_keys + 13*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkey12         ; key 12
%assign i (i+1)
%endrep

        movdqa  xkeyB, [p_keys + 14*16]
%assign i 0
%rep %%by
        aesenc  CONCAT(xdata,i), xkeyA          ; key 13
%assign i (i+1)
%endrep

%assign i 0
%rep %%by
        aesenclast CONCAT(xdata,i), xkeyB       ; key 14
%assign i (i+1)
%endrep

        ;; XOR the keystream with the input, two blocks at a time.
        ;; xkeyA/xkeyB are free now and serve as load temporaries.
%assign i 0
%rep (%%by / 2)
%assign j (i+1)
        MOVDQ   xkeyA, [p_in + i*16 - 16*%%by]
        MOVDQ   xkeyB, [p_in + j*16 - 16*%%by]
        pxor    CONCAT(xdata,i), xkeyA
        pxor    CONCAT(xdata,j), xkeyB
%assign i (i+2)
%endrep
        ;; Odd block count: one block is left over
%if (i < %%by)
        MOVDQ   xkeyA, [p_in + i*16 - 16*%%by]
        pxor    CONCAT(xdata,i), xkeyA
%endif

%ifidn %%cntr_type, CNTR_BIT
        ;; check if this is the end of the message
        ;; (no length bits set other than those of this batch)
        mov     tmp, num_bytes
        and     tmp, ~(%%by*16)
        jnz     %%skip_preserve
        ;; Check if there is a partial byte
        test    r_bits, r_bits
        jz      %%skip_preserve

%assign idx (%%by - 1)
        ;; Load output to get last partial byte
        ;; (xtmp aliases xcounter, which is not needed past this point)
        movdqu  xtmp, [p_out + idx * 16]

        ;; CL is needed for the variable shift; RCX holds a live argument,
        ;; so save it in a temporary GP register and restore it afterwards
%ifidn r_bits, rcx
%error "r_bits cannot be mapped to rcx!"
%endif
        mov     tmp, rcx
        mov     mask, 0xff
        mov     cl, BYTE(r_bits)
        shr     mask, cl        ;; e.g. 3 remaining bits -> mask = 00011111
        mov     rcx, tmp

        ;; Move the mask into the last byte of the 16-byte block
        movq    xtmp2, mask
        pslldq  xtmp2, 15
        ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
        ;; in the partial byte

        ;; Clear all the bits that do not need to be preserved from the output
        pand    xtmp, xtmp2

        ;; Clear all bits from the input that are not to be ciphered
        pandn   xtmp2, CONCAT(xdata, idx)
        por     xtmp2, xtmp
        movdqa  CONCAT(xdata, idx), xtmp2

%%skip_preserve:
%endif

        ;; Store the processed blocks
%assign i 0
%rep %%by
        MOVDQ   [p_out + i*16], CONCAT(xdata,i)
%assign i (i+1)
%endrep
%endmacro
296 | ||
11fdf7f2 TL |
297 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
298 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
299 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
300 | section .text | |
301 | ||
f67539c2 TL |
;-----------------------------------------------------------------------
;; Macro performing AES-CTR.
;;
;; Expands to a complete function body (including `ret`) that processes
;; the whole message:
;;   1. build the initial counter block from the IV,
;;   2. process (number of blocks mod 4) leading blocks,
;;   3. process the remaining full blocks four at a time,
;;   4. process a trailing partial block (1-15 bytes), if any.
;;
;; %1 : CNTR     - length is given in bytes; the IV is 12 or 16 bytes long,
;;                 selected by bit 4 of p_ivlen
;;      CNTR_BIT - length is given in bits; the IV is always 16 bytes and
;;                 p_ivlen is not read. r12-r14 are saved and restored.
;;
;; In:    p_in, p_IV, p_keys, p_out, num_bytes/num_bits, p_ivlen
;; Clobb: xmm0-xmm15 (not saved here), tmp, rax (partial block store),
;;        flags; p_in/p_out/num_bytes are modified
;-----------------------------------------------------------------------
%macro DO_CNTR 1
%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)

%ifndef LINUX
        ;; 5th argument lives on the stack; fetch it before rsp changes
        mov     num_bytes, [rsp + 8*5]
%endif

%ifidn %%CNTR_TYPE, CNTR_BIT
        ;; r_bits, tmp2 and mask are mapped to r12-r14
        push    r12
        push    r13
        push    r14
%endif

        movdqa  xbyteswap, [rel byteswap_const]
%ifidn %%CNTR_TYPE, CNTR
        test    p_ivlen, 16
        jnz     %%iv_is_16_bytes
        ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
        mov     DWORD(tmp), 0x01000000
        pinsrq  xcounter, [p_IV], 0
        pinsrd  xcounter, [p_IV + 8], 2
        pinsrd  xcounter, DWORD(tmp), 3

%else ;; CNTR_BIT
        ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
        movdqu  xcounter, [p_IV]
%endif

%%bswap_iv:
        ;; Shuffle the counter block so that do_aes can advance it with
        ;; packed integer adds
        pshufb  xcounter, xbyteswap

        ;; calculate len
        ;; convert bits to bytes (message length in bits for CNTR_BIT)
%ifidn %%CNTR_TYPE, CNTR_BIT
        mov     r_bits, num_bits
        add     num_bits, 7
        shr     num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
        and     r_bits, 7   ; Check if there are remainder bits (0-7)
%endif
        ;; tmp = 16 * (number of full blocks mod 4)
        mov     tmp, num_bytes
        and     tmp, 3*16
        jz      %%chk           ; no leading 1-3 blocks to process first

        ; tmp is 1*16, 2*16 or 3*16
        cmp     tmp, 2*16
        ja      %%eq3
        je      %%eq2
%%eq1:
        do_aes_load     1, %%CNTR_TYPE
        add     p_out, 1*16
        jmp     %%chk

%%eq2:
        do_aes_load     2, %%CNTR_TYPE
        add     p_out, 2*16
        jmp     %%chk

%%eq3:
        do_aes_load     3, %%CNTR_TYPE
        add     p_out, 3*16
        ; fall through to chk
%%chk:
        ;; Drop the 1-3 blocks handled above from the length
        and     num_bytes, ~(3*16)
        jz      %%do_return2

        ;; Fewer than 16 bytes left: only a partial block remains
        cmp     num_bytes, 16
        jb      %%last

        ; process multiples of 4 blocks
        ; (round keys 0, 4, 8 and 12 stay in registers across iterations)
        movdqa  xkey0, [p_keys + 0*16]
        movdqa  xkey4, [p_keys + 4*16]
        movdqa  xkey8, [p_keys + 8*16]
        movdqa  xkey12, [p_keys + 12*16]

align 32
%%main_loop2:
        ; num_bytes is a multiple of 4 blocks + partial bytes
        do_aes_noload   4, %%CNTR_TYPE
        add     p_out, 4*16
        sub     num_bytes, 4*16
        cmp     num_bytes, 4*16
        jae     %%main_loop2

        ; Check if there is a partial block
        test    num_bytes, num_bytes
        jnz     %%last

%%do_return2:

%ifidn %%CNTR_TYPE, CNTR_BIT
        pop     r14
        pop     r13
        pop     r12
%endif

        ret

%%last:

        ; load partial block into XMM register
        simd_load_sse_15_1 xpart, p_in, num_bytes

%%final_ctr_enc:
        ; Encryption of a single partial block
        ; (same counter shuffle as do_aes applies to each block)
        pshufb  xcounter, xbyteswap
        movdqa  xdata0, xcounter
        pxor    xdata0, [p_keys + 16*0]
%assign i 1
%rep 13
        aesenc  xdata0, [p_keys + 16*i]
%assign i (i+1)
%endrep
        ; created keystream
        aesenclast xdata0, [p_keys + 16*i]

        ; xor keystream with the message (scratch)
        pxor    xdata0, xpart

%ifidn %%CNTR_TYPE, CNTR_BIT
        ;; Check if there is a partial byte
        test    r_bits, r_bits
        jz      %%store_output

        ;; Load output to get last partial byte
        ;; (xtmp aliases xcounter, which is no longer needed)
        simd_load_sse_15_1 xtmp, p_out, num_bytes

        ;; CL is needed for the variable shift; RCX holds a live argument,
        ;; so save it in a temporary GP register and restore it afterwards
        mov     tmp, rcx
        mov     mask, 0xff
%ifidn r_bits, rcx
%error "r_bits cannot be mapped to rcx!"
%endif
        mov     cl, BYTE(r_bits)
        shr     mask, cl        ;; e.g. 3 remaining bits -> mask = 00011111
        mov     rcx, tmp

        movq    xtmp2, mask

        ;; Get number of full bytes in last block of 16 bytes
        mov     tmp, num_bytes
        dec     tmp
        ;; Move the mask byte up to the position of the last message byte
        ;; (XPSLLB is defined in an included file; xtmp3 and tmp2 are
        ;; presumably its scratch registers - confirm at the definition)
        XPSLLB  xtmp2, tmp, xtmp3, tmp2
        ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
        ;; in the partial byte

        ;; Clear all the bits that do not need to be preserved from the output
        pand    xtmp, xtmp2

        ;; Clear the bits from the input that are not to be ciphered
        pandn   xtmp2, xdata0
        por     xtmp2, xtmp
        movdqa  xdata0, xtmp2
%endif

%%store_output:
        ; copy result into the output buffer
        simd_store_sse_15 p_out, xdata0, num_bytes, tmp, rax

        jmp     %%do_return2

%ifidn %%CNTR_TYPE, CNTR
        ;; Only the CNTR variant branches here, so the stub is not emitted
        ;; for CNTR_BIT
%%iv_is_16_bytes:
        ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
        movdqu  xcounter, [p_IV]
        jmp     %%bswap_iv
%endif
%endmacro
469 | ||
;; Byte-length entry point.
;; iv_len selects the IV layout: 16 -> full 16-byte counter block,
;; otherwise a 12-byte IV is read and the block counter is set to 1.
align 32
;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
MKGLOBAL(AES_CNTR_256,function,internal)
AES_CNTR_256:
        DO_CNTR CNTR

;; Bit-length entry point.
;; The IV is always read as 16 bytes; iv_len is not read by this variant.
;; Aligned like the first entry point; the padding is never executed
;; because the expansion above ends with an unconditional jump.
align 32
;; aes_cntr_bit_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bits, UINT64 iv_len)
MKGLOBAL(AES_CNTR_BIT_256,function,internal)
AES_CNTR_BIT_256:
        DO_CNTR CNTR_BIT

;; ELF only: emit the section that marks this object as not requiring an
;; executable stack
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif